diff --git a/CHANGELOG.md b/CHANGELOG.md index 96053eefb0..eded0a4ef0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,17 @@ # NVIDIA CUTLASS Changelog # CUTLASS 2.x +## [2.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.4.0) (2020-11-19) + * Implicit GEMM convolution kernels supporting CUDA and Tensor Cores on NVIDIA GPUs + * Operators: forward (Fprop), backward data gradient (Dgrad), and backward weight gradient (Wgrad) convolution + * Data type: FP32, complex, Tensor Float 32 (TF32), BFloat16 (BF16), Float16, Int4, Int8, Int32 + * Spatial dimensions: 1-D, 2-D, and 3-D + * Layout: NHWC, NCxHWx + * Implicit GEMM convolution components: + * Global memory iterators supporting fprop, dgrad, and wgrad + * `MmaMultistage` for implicit GEMM convolution for NVIDIA Ampere architecture + * `MmaPipeline` for implicit GEMM convolution for NVIDIA Volta and Turing architectures + * [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation ## [2.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.3.0) (2020-09-23) * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/) diff --git a/CMakeLists.txt b/CMakeLists.txt index d853a9dd3c..a0ece82c6d 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ endif() message(STATUS "CMake Version: ${CMAKE_VERSION}") -project(CUTLASS VERSION 2.3.0 LANGUAGES CXX) +project(CUTLASS VERSION 2.4.0 LANGUAGES CXX) include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake) find_package(Doxygen QUIET) @@ -137,7 +137,12 @@ if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES)) endif() set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CUTLASS_LIBRARY_DEBUG_POSTFIX ".debug" CACHE STRING "Default postfix value for debug libraries") +if (DEFINED CMAKE_DEBUG_POSTFIX) + set(CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT ${CMAKE_DEBUG_POSTFIX}) +else() + set(CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT .debug) +endif() +set(CUTLASS_LIBRARY_DEBUG_POSTFIX ${CUTLASS_LIBRARY_DEBUG_POSTFIX_INIT} CACHE STRING "Default postfix value for debug libraries") if(WIN32) # On Windows we link against the shared (DLL) runtime. Change gtest settings to match this. 
@@ -192,7 +197,6 @@ endif() set(CUTLASS_DEBUG_TRACE_LEVEL "0" CACHE STRING "Level of debug tracing to perform.") list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_DEBUG_TRACE_LEVEL=${CUTLASS_DEBUG_TRACE_LEVEL}) - set(CUTLASS_ENABLE_TENSOR_CORE_MMA ${CUTLASS_ENABLE_TENSOR_CORE_MMA_DEFAULT} CACHE BOOL "Enable PTX mma instruction for collective matrix multiply operations.") @@ -466,21 +470,195 @@ if (CUTLASS_ENABLE_CUBLAS) target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1) endif() +include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake) + +if (CUTLASS_ENABLE_CUDNN) + target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1) +endif() + ################################################################################ -if(CUTLASS_ENABLE_TOOLS) +include(CTest) +enable_testing() +if (NOT TARGET test_all) + add_custom_target(test_all) +endif() + +set(CUTLASS_INSTALL_TESTS ON CACHE BOOL "Install test executables") +set(CUTLASS_TEST_EXECUTION_ENVIRONMENT "" CACHE BOOL "Environment in which to invoke unit test executables") + +set(CMAKE_TEST_INSTALL_PREFIX test CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.") +set(CUTLASS_TEST_INSTALL_PREFIX ${CMAKE_TEST_INSTALL_PREFIX}/cutlass CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.") +set(CUTLASS_TEST_INSTALL_BINDIR ${CUTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.") +set(CUTLASS_TEST_INSTALL_LIBDIR ${CUTLASS_TEST_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} CACHE STRING "Test root install location, relative to CMAKE_INSTALL_PREFIX.") + +install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}) +install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR}) +install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_LIBDIR}) +install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest) + +set(CUTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.config.cmake) +set(CUTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "") + +function(cutlass_add_executable_tests NAME TARGET) +# +# Generates test rules for `make test`, `make test_all`, and `ctest` invoked from either the +# or the / after installation. +# +# NAME: The base name for the test. Can be run with `make ` or `ctest -R 'c'`. +# TARGET: The target corresponding to the executable under test. +# DISABLE_EXECUTABLE_INSTALL_RULE: An option, if given, that disables creating an install rule for TARGET. +# DEPENDS: A list of targets or files on which this test is dependent. +# DEPENDEES: A list of targets which should depend on this test. +# TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments +# to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of +# options given. If this option is not used, a single test with no arguments is generated. 
+# + + set(options DISABLE_EXECUTABLE_INSTALL_RULE) + set(oneValueArgs) + set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS) + cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (NOT __DISABLE_EXECUTABLE_INSTALL_RULE AND CUTLASS_INSTALL_TESTS) + + # file(RELATIVE_PATH CMAKE_CURRENT_BINARY_RELATIVE_DIR ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + + install( + TARGETS ${TARGET} + RUNTIME DESTINATION ${CUTLASS_TEST_INSTALL_BINDIR} + ) + + endif() + + if (NOT __TEST_COMMAND_OPTIONS) + set(__TEST_COMMAND_OPTIONS " ") + endif() + + list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT) + set(CMD_IDX 0) + + if (CMD_COUNT GREATER 1) + add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS}) + foreach(DEPENDEE ${__DEPENDEES}) + add_dependencies(${DEPENDEE} ${NAME}) + endforeach() + endif() + + foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS}) + + if (CMD_COUNT GREATER 1) + set(TEST_NAME ${NAME}_${CMD_IDX}) + else() + set(TEST_NAME ${NAME}) + endif() + + # The following rigmarole is needed to deal with spaces and possible quotes in + # command line arguments. The options are passed "by reference" as the actual + # variable names holding the real options. We then expand these in a way that + # preserves any quotes. Note, they have to be in this order for it to work for + # all the use cases below. + + set(CMD_OPTIONS ${${CMD_OPTIONS}}) + list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS) + separate_arguments(CMD_OPTIONS) + + add_custom_target( + ${TEST_NAME} + COMMAND + ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} + DEPENDS + ${TARGET} + ) + + if (CMD_COUNT GREATER 1) + add_dependencies(${NAME} ${TEST_NAME}) + endif() + + foreach(DEPENDEE ${__DEPENDEES}) + add_dependencies(${DEPENDEE} ${TEST_NAME}) + endforeach() + + add_test( + NAME c${TEST_NAME} + COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} + ) + + if (CUTLASS_INSTALL_TESTS) + + # To run the tests from an install package with tests enabled, we need to generate test files + # that don't rely on the current directory structure in build. 
+ + set(TEST_NAME c${TEST_NAME}) + set(TEST_EXE $) + set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR}) + configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY) + + file(GENERATE + OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" + INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" + ) + + install( + FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" + DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/ + ) + + set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "") + + endif() + + math(EXPR CMD_IDX "${CMD_IDX} + 1") + + endforeach() + +endfunction() + +if (CUTLASS_ENABLE_TOOLS) add_subdirectory(tools) + if (CUTLASS_ENABLE_PROFILER) + add_dependencies(test_all test_profiler) + endif() endif() -if(CUTLASS_ENABLE_EXAMPLES) +if (CUTLASS_ENABLE_EXAMPLES) add_subdirectory(examples) + add_dependencies(test_all test_examples) endif() -if(CUTLASS_ENABLE_TESTS) - include(CTest) - enable_testing() +if (CUTLASS_ENABLE_TESTS) add_subdirectory(test) + add_dependencies(test_all test_unit) +endif() + +if (CUTLASS_INSTALL_TESTS) + + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/cmake") + + file(WRITE "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "# Generated File\n") + foreach(GENERATED_FILE ${CUTLASS_CTEST_GENERATED_FILES}) + file(APPEND "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" "include(${GENERATED_FILE})\n") + endforeach() + + install( + FILES "${CMAKE_BINARY_DIR}/cmake/CTestTestfile.cmake" + DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ + ) + endif() +#? install( +#? FILES ${CMAKE_BINARY_DIR}/CTestTestfile.cmake +#? DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ +#? ) +#? +#? install( +#? DIRECTORY +#? ${CMAKE_BINARY_DIR}/tools +#? ${CMAKE_BINARY_DIR}/test +#? DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ +#? FILES_MATCHING PATTERN "CTestTestfile.cmake" +#? ) + ################################################################################ install( diff --git a/README.md b/README.md index 88a1b40706..d7a1d7d475 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 2.3 +# CUTLASS 2.4 -_CUTLASS 2.3 - September 2020_ +_CUTLASS 2.4 - November 2020_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. @@ -25,11 +25,22 @@ Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations targeting the programmable, high-throughput _Tensor Cores_ implemented by NVIDIA's Volta, Turing, and Ampere architectures. +Additionaly, CUTLASS implements high-performance convolution (implicit GEMM). +Implicit GEMM is the formulation of a convolution operation as a GEMM. This allows CUTLASS +to build convolutions by reusing highly optimized warp-wide GEMM components and below. + See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly. -See the [functionality listing](media/docs/functionality.md) for the list of operations +See the [functionality listing](/media/docs/functionality.md) for the list of operations supported at each level of the execution model hierarchy. 
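To make the implicit GEMM formulation described above concrete, here is a small standalone sketch (not part of CUTLASS; the struct and function names are illustrative) of how a forward-propagation (fprop) convolution maps onto the extents of an equivalent GEMM: each output pixel becomes a GEMM row, each filter a column, and the reduction runs over the R x S x C filter footprint.

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative only (not the CUTLASS API): extents of the GEMM computed by an
// implicit GEMM fprop convolution with NHWC activations, KRSC filters, NPQK output.
struct Conv2dSize {
  int n, h, w, c;   // activation extents N x H x W x C
  int k, r, s;      // filter extents K x R x S (C is shared with the activation)
  int p, q;         // output extents P x Q, already derived from padding/stride/dilation
};

struct GemmSize { int64_t m, n, k; };

// Fprop viewed as a GEMM: M = N*P*Q output pixels, N = K filters, K = R*S*C reduction.
GemmSize implicit_gemm_fprop_size(Conv2dSize const &cs) {
  return { int64_t(cs.n) * cs.p * cs.q, int64_t(cs.k), int64_t(cs.r) * cs.s * cs.c };
}

int main() {
  // Sizes borrowed from the profiler example later in this README diff.
  Conv2dSize cs{8, 224, 224, 128, 128, 3, 3, 224, 224};
  GemmSize g = implicit_gemm_fprop_size(cs);
  std::printf("equivalent GEMM: m=%lld n=%lld k=%lld\n",
              (long long)g.m, (long long)g.n, (long long)g.k);
  return 0;
}
```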
+# What's New in CUTLASS 2.4 +CUTLASS 2.4 is a significant update to CUTLASS adding: +- 1-D, 2-D, and 3-D convolution targeting Tensor and CUDA cores for NVIDIA Ampere, Turing, and Volta GPU architectures +- CUTLASS profiler support for convolution +- [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation +- See the [CHANGELOG](CHANGELOG.md) for more details. + # What's New in CUTLASS 2.3 CUTLASS 2.3 is a minor update to CUTLASS adding: @@ -118,6 +129,7 @@ CUTLASS is described in the following documents and the accompanying - [Functionality](/media/docs/functionality.md) - summarizes functionality available in CUTLASS - [Efficient GEMM in CUDA](media/docs/efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA - [GEMM API](media/docs/gemm_api.md) - describes the CUTLASS GEMM model and C++ template concepts +- [Implicit GEMM Convolution](media/docs/implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS - [Code Organization](media/docs/code_organization.md) - describes the organization and contents of the CUTLASS project - [Terminology](media/docs/terminology.md) - describes terms used in the code - [Programming Guidelines](media/docs/programming_guidelines.md) - guidelines for writing efficient modern CUDA C++ @@ -140,7 +152,7 @@ CUTLASS unit tests, examples, and utilities can be build with CMake starting ver Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed on your system. -``` +```bash $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc ``` @@ -149,7 +161,7 @@ for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, and 8.6. To reduce the architectures to build CUTLASS for by changing the CMake configuration setting `CUTLASS_NVCC_ARCHS`. -``` +```bash $ mkdir build && cd build $ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA's Ampere Architecture @@ -160,7 +172,7 @@ From the `build/` directory, compile and run the CUTLASS unit tests by building The unit tests are organized as several binaries mirroring the top-level namespaces of CUTLASS, and they may be executed in parallel via make's `-j` command line argument. -``` +```bash $ make test_unit -j ... ... 
@@ -191,6 +203,8 @@ include/ # client applications should target this directory arch/ # direct exposure of architecture features (including instruction-level GEMMs) + conv/ # code specialized for convolution + gemm/ # code specialized for general matrix product computations layout/ # layout definitions for matrices, tensors, and other mathematical objects in memory @@ -210,34 +224,39 @@ include/ # client applications should target this directory ``` examples/ - 00_basic_gemm/ # launches a basic GEMM with single precision inputs and outputs + 00_basic_gemm/ # launches a basic GEMM with single precision inputs and outputs - 01_cutlass_utilities/ # demonstrates CUTLASS Utilities for allocating and initializing tensors + 01_cutlass_utilities/ # demonstrates CUTLASS Utilities for allocating and initializing tensors - 02_dump_reg_smem/ # debugging utilities for printing register and shared memory contents + 02_dump_reg_smem/ # debugging utilities for printing register and shared memory contents - 03_visualize_layout/ # utility for visualizing all layout functions in CUTLASS + 03_visualize_layout/ # utility for visualizing all layout functions in CUTLASS + + 04_tile_iterator/ # example demonstrating an iterator over tiles in memory + + 05_batched_gemm/ # example demonstrating CUTLASS's batched strided GEMM operation - 04_tile_iterator/ # example demonstrating an iterator over tiles in memory + 06_splitK_gemm/ # exmaple demonstrating CUTLASS's Split-K parallel reduction kernel - 05_batched_gemm/ # example demonstrating CUTLASS's batched strided GEMM operation + 07_volta_tensorop_gemm/ # example demonstrating mixed precision GEMM using Volta Tensor Cores - 06_splitK_gemm/ # exmaple demonstrating CUTLASS's Split-K parallel reduction kernel + 08_turing_tensorop_gemm/ # example demonstrating integer GEMM using Turing Tensor Cores - 07_volta_tensorop_gemm/ # example demonstrating mixed precision GEMM using Volta Tensor Cores + 09_turing_tensorop_conv2dfprop/ # example demonstrating integer implicit GEMM convolution (forward propagation) using Turing Tensor Cores - 08_turing_tensorop_gemm/ # example demonstrating integer GEMM using Turing Tensor Cores + 10_planar_complex/ # example demonstrating planar complex GEMM kernels - 10_planar_complex/ # example demonstrating planar complex GEMM kernels + 11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes - 11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes + 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu - 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu + 13_fused_two_gemms/ # example demonstrating two GEMms fused in one kernel - 13_fused_two_gemms/ # example demonstrating two GEMms fused in one kernel + 22_ampere_tensorop_conv2dfprop/ # example demonstrating integer implicit GEMM convolution (forward propagation) using Ampere Tensor Cores ``` ### Tools + ``` tools/ library/ # CUTLASS Instance Library - contains instantiations of all supported CUTLASS templates @@ -266,14 +285,14 @@ Instructions for building and running the Unit tests are described in the [Quick The `tools/profiler/` directory contains a command-line utility for launching each of the GEMM kernels. It can be built as follows: -``` +```bash $ make cutlass_profiler -j16 ``` By default, only one tile size is instantiated for each data type, math instruction, and layout. 
To instantiate all, set the following environment variable when running CMake from an empty `build/` directory. Beware, this results in *thousands* of kernels and long build times. -``` +```bash $ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all ... $ make cutlass_profiler -j16 @@ -282,7 +301,7 @@ $ make cutlass_profiler -j16 To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with wildcard characters may be reduce the set of kernels. The following builds exactly one kernel: -``` +```bash $ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1 ... $ make cutlass_profiler -j16 @@ -318,6 +337,56 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 Math: 17218.4 GFLOP/s ``` +To compile strictly 2-D or 3-D convolution kernels, filter by operation +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_OPERATIONS=conv2d,conv3d +... +$ make cutlass_profiler -j16 +``` + +or by name + +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_KERNELS=sfprop,s16816fprop,s16816dgrad,s16816wgrad +... +$ make cutlass_profiler -j16 +``` + +Example command line for profiling 2-D convolution kernels is as follows: + +```bash +$ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 + + +============================= + Problem ID: 1 + + Provider: CUTLASS + OperationKind: conv2d + Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc + + Status: Success + Verification: ON + Disposition: Passed + +reference_device: Passed + + Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ + --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc \ + --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ + --eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 + + Bytes: 2055798784 bytes + FLOPs: 118482796544 flops + + Runtime: 8.13237 ms + Memory: 235.431 GiB/s + + Math: 14569.3 GFLOP/s + +``` + [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md) diff --git a/cmake/CTestTestfile.config.cmake b/cmake/CTestTestfile.config.cmake new file mode 100644 index 0000000000..65fda51a70 --- /dev/null +++ b/cmake/CTestTestfile.config.cmake @@ -0,0 +1,19 @@ +# Generated file + +if (DEFINED ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT}) + set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT $ENV{CUTLASS_TEST_EXECUTION_ENVIRONMENT}) +else() + set(_CUTLASS_TEST_EXECUTION_ENVIRONMENT @CUTLASS_TEST_EXECUTION_ENVIRONMENT@) +endif() + +if (NOT "@TEST_EXE_DIR@" STREQUAL "") + set(TEST_EXE_PATH @TEST_EXE_DIR@/@TEST_EXE@) +else() + set(TEST_EXE_PATH @TEST_EXE@) +endif() + +add_test("@TEST_NAME@" ${_CUTLASS_TEST_EXECUTION_ENVIRONMENT} "${TEST_EXE_PATH}" @TEST_COMMAND_OPTIONS@) + +if (NOT "@TEST_EXE_WORKING_DIRECTORY@" STREQUAL "") + set_tests_properties("@TEST_NAME@" PROPERTIES WORKING_DIRECTORY "@TEST_EXE_WORKING_DIRECTORY@") +endif() diff --git a/cuBLAS.cmake b/cuBLAS.cmake index 4c73a1db4c..0ad6db2378 100644 --- a/cuBLAS.cmake +++ b/cuBLAS.cmake @@ -1,3 +1,24 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. message(STATUS "Configuring cublas ...") diff --git a/cuDNN.cmake b/cuDNN.cmake new file mode 100644 index 0000000000..da5e453131 --- /dev/null +++ b/cuDNN.cmake @@ -0,0 +1,107 @@ + +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +if(DEFINED CUDNN_ENABLED) + set(CUTLASS_ENABLE_CUDNN ${CUDNN_ENABLED} CACHE BOOL "Enable CUTLASS to build with cuDNN library.") +endif() + +if(DEFINED CUTLASS_ENABLE_CUDNN AND NOT CUTLASS_ENABLE_CUDNN) + return() +endif() + +message(STATUS "Configuring cuDNN ...") + +find_path( + _CUDNN_INCLUDE_DIR cudnn.h + PATHS + ${CUDA_TOOLKIT_ROOT_DIR}/include + $ENV{CUDNN_PATH}/include + $ENV{CUDA_PATH}/include + ${CUDNN_PATH}/include + /usr/include) + +find_library( + _CUDNN_LIBRARY cudnn + HINTS + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib + $ENV{CUDNN_PATH}/lib64 + $ENV{CUDNN_PATH}/lib/x64 + $ENV{CUDNN_PATH}/lib + $ENV{CUDA_PATH}/lib64 + $ENV{CUDA_PATH}/lib/x64 + $ENV{CUDA_PATH}/lib + ${CUDNN_PATH}/lib64 + ${CUDNN_PATH}/lib/x64 + ${CUDNN_PATH}/lib + /usr/lib/x86_64-linux-gnu + /usr/lib) + +if(_CUDNN_INCLUDE_DIR AND _CUDNN_LIBRARY) + + message(STATUS "cuDNN: ${_CUDNN_LIBRARY}") + message(STATUS "cuDNN: ${_CUDNN_INCLUDE_DIR}") + + set(CUDNN_FOUND ON CACHE INTERNAL "cuDNN Library Found") + +else() + + message(STATUS "cuDNN not found.") + set(CUDNN_FOUND OFF CACHE INTERNAL "cuDNN Library Found") + +endif() + +set(CUTLASS_ENABLE_CUDNN ${CUDNN_FOUND} CACHE BOOL "Enable CUTLASS to build with cuDNN library.") + +if (CUTLASS_ENABLE_CUDNN AND NOT TARGET cudnn) + + set(CUDNN_INCLUDE_DIR ${_CUDNN_INCLUDE_DIR}) + set(CUDNN_LIBRARY ${_CUDNN_LIBRARY}) + + if(WIN32) + add_library(cudnn STATIC IMPORTED GLOBAL) + else() + add_library(cudnn SHARED IMPORTED GLOBAL) + endif() + + add_library(nvidia::cudnn ALIAS cudnn) + + set_property( + TARGET cudnn + PROPERTY IMPORTED_LOCATION + ${CUDNN_LIBRARY}) + + target_include_directories( + cudnn + INTERFACE + $ + $) + +endif() + +if(CUTLASS_ENABLE_CUDNN AND NOT CUDNN_FOUND) + message(FATAL_ERROR "CUTLASS_ENABLE_CUDNN enabled but cuDNN library could not be found.") +endif() + +message(STATUS "Configuring cuDNN ... done.") diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt index e2bb283489..27a87c9292 100644 --- a/examples/03_visualize_layout/CMakeLists.txt +++ b/examples/03_visualize_layout/CMakeLists.txt @@ -20,9 +20,15 @@ # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +set(TEST_COMMAND_00 RowMajor --extent=16,16) +set(TEST_COMMAND_01 "ColumnMajorInterleaved<4>" --extent=32,8 --output-shape=16 --vectorize=4) + cutlass_example_add_executable( 03_visualize_layout visualize_layout.cpp register_layout.cu + TEST_COMMAND_OPTIONS + TEST_COMMAND_00 + TEST_COMMAND_01 ) diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp index a0f2718122..3c4b783ca6 100644 --- a/examples/03_visualize_layout/visualize_layout.cpp +++ b/examples/03_visualize_layout/visualize_layout.cpp @@ -32,6 +32,8 @@ #include #include +#include + #include "options.h" #include "register_layout.h" @@ -133,6 +135,8 @@ int main(int argc, char const *arg[]) { layout_it->second->print_csv(std::cout); + cudaFree(0); // Ensure CUDA is available. 
+ return 0; } diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index d18a4e6ab7..36f794d921 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -188,31 +188,6 @@ using Gemm = cutlass::gemm::device::Gemm 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { - std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; - return -1; - } - - cudaDeviceProp props; - - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - if (!((props.major * 10 + props.minor) >= 75)) { - std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75." - << std::endl; - - // Return 0 so tests are considered passing if run on unsupported platforms. - return 0; - } - const int length_m = 5120; const int length_n = 4096; const int length_k = 4096; @@ -337,18 +312,37 @@ int run() { } int main() { + bool notSupported = false; + // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available // in CUDA 10.2. // // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + notSupported = true; + } - // Returning zero so this test passes when built on older Toolkits. - return 0; + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!((props.major * 10 + props.minor) >= 75)) { + std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 75." + << std::endl; + + notSupported = true; } - else { - return run(); + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; } + + return run(); } diff --git a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt new file mode 100644 index 0000000000..b1b5c8df1e --- /dev/null +++ b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + 09_turing_tensorop_conv2dfprop + turing_tensorop_conv2dfprop.cu + ) + diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu new file mode 100644 index 0000000000..cf07efdcb5 --- /dev/null +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -0,0 +1,758 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** + + +This example shows how to run convolution kernels using functions and data structures +provided by CUTLASS using tensor cores; which we run on a NVIDIA Turing GPU. + +Writing a single high performance convolution kernel is hard but do-able. Whereas writing +high performance kernels at scale which works for multiple problem sizes with good abstractions is +really hard. CUTLASS solves this problem by providing simplified abstractions to compose +multiple sections of implicit gemm kernel. When used properly, the kernels can hit peak performance +of GPU easily. 
+
+CUTLASS divides a kernel into hierarchical, composable sections: the thread, warp, and threadblock
+levels each compute their own tile, and higher-level tiles are composed from lower-level ones.
+Multiple thread tiles (the tile each thread computes) form a warp tile (the tile each warp
+computes), and multiple warp tiles form a threadblock tile (the tile computed by a threadblock).
+
+In this example, we split variable initialization into two parts:
+1. Setting up data properties: how tensors are laid out in memory and how the kernel can view them
+(the logical-to-physical mapping).
+2. Setting up computation properties: how those tensors are used to compute the output of the
+convolution.
+
+First, we set up the data types of the input tensor A, the weight tensor B, and the output tensor C,
+along with alpha and beta, since the convolution computes C = alpha * Conv(A, B) + beta * C. In
+CUTLASS, the kernel first computes Conv(A, B) and defers the rest to the end of the kernel, because
+alpha * X + beta * C is a simple element-wise operation on X (= Conv(A, B)) and C; we call this the
+epilogue of the kernel. Accordingly, we set the data type of alpha and beta to
+ElementComputeEpilogue = float. We want to use the Turing MMA instructions, which support 4-bit
+signed integers. Because 4-bit integers are not a native C++ type, CUTLASS provides
+cutlass::int4b_t, which we use for the elements of input tensors A and B. We convey all of this to
+the CUTLASS kernel by initializing the template parameters ElementAccumulator (int32_t),
+ElementComputeEpilogue (float), ElementInputA (cutlass::int4b_t), ElementInputB (cutlass::int4b_t),
+and ElementOutput (cutlass::int4b_t). Communicating just the data types is not enough: because the
+data is laid out linearly in memory, we must also convey the tensor layouts, which we do by
+initializing the template parameters LayoutInputA, LayoutInputB, and LayoutOutput to
+cutlass::layout::TensorNHWC. Next, we set up the rule for computing alpha * X + beta * C, the
+epilogue of the kernel, by initializing the template parameter EpilogueOp. It takes the data type
+of the output (ElementOutput), the number of elements per vectorized memory access (which also
+becomes the vector width of the math instructions in the epilogue), the data type of the
+accumulator (int32_t), and the data type used to compute the linear combination
+(ElementComputeEpilogue).
+
+Now that the properties of the data are set up, we set up the properties of the computation.
+
+Second, we set the tile sizes computed by the threadblock, the warp, and the MMA op to 128x128x128,
+64x64x128, and 8x8x32 (MxNxK), respectively. When these are passed to instantiate the CUTLASS
+Implicit GEMM kernel, it internally deduces the number of threads needed per threadblock, the
+amount of shared memory, how to store data in shared memory without bank conflicts, and many other
+details required to compose, initialize, and launch a high-performance Implicit GEMM kernel. This
+is the strength of CUTLASS: it relieves the developer from understanding and coding complicated
+hardware optimizations, which can easily go wrong.
+
+CUTLASS also supports multiple MMA pipelines within a threadblock. An MMA pipeline is the whole
+process of loading input data from global memory to shared memory, loading data from shared memory
+to registers, performing the matrix multiplication, and storing the result to global memory. The
+flow sequence below shows a typical MMA pipeline.
+
+tensor in global memory -> registers -> tile in shared memory -> registers -> mma -> registers ->
+output to global memory
+
+The problem with a single pipeline is that each stage is synchronous: each stage has to wait until
+the previous one has finished executing. Some stages of the pipeline do not have fixed latency, for
+example the loads from global memory and from shared memory. We can therefore add a second
+pipeline, phase-shifted relative to the first, to hide the latency of the global and shared memory
+loads. The pipelines in the kernel then look like
+
+(1) tensor in global memory -> (2) registers -> (3) tile in shared memory -> (4) registers ->
+(5) mma -> (6) registers -> (7) output to global memory
+
+(1) -> (2) -> (3) tensor in global memory -> (4) registers -> (5) tile in shared memory ->
+(6) registers -> (7) mma -> (8) registers -> (9) output to global memory
+
+This way, the latency of the second global memory load is hidden by computing on input data that
+has already been loaded.
+
+A few more template parameters are initialized, such as the threadblock swizzle, which determines
+which threadblock tile of the output is computed by each threadblock launched on an SM, and the
+CUDA SM architecture of the GPU you want to run on.
+
+These are all put together to create a single type describing the CUTLASS Implicit GEMM kernel,
+using the cutlass::conv::device::ImplicitGemmConvolution template.
+
+The next step is to initialize the physical data, instantiate and initialize the CUTLASS kernel,
+and run it. We use the CUTLASS utilities to initialize, fill, and compare tensors, because they are
+simple and do not get in the way of learning CUTLASS.
+
+Once all the tensors are initialized and filled with data, we create the arguments tuple used to
+launch the CUTLASS kernel. It takes the problem size (N, H, W, C), the filter size (K, R, S, C),
+padding, strides, dilation, the tensor references, alpha, beta, and, importantly, the split
+k-dimension factor. We also query CUTLASS for any scratch-space memory required by the kernel we
+instantiated; if any is needed, we allocate it and pass it along with the other arguments used to
+initialize the CUTLASS kernel, and then the kernel is launched.
+
+Finally, this example can launch a reference convolution kernel (from the CUTLASS utilities) to
+check that the output of the CUTLASS kernel matches the reference implementation.
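+
+As a concrete illustration of the epilogue described above, the sketch below (plain C++, not
+CUTLASS code) shows roughly what the clamped linear combination computes per output element; the
+clamp bounds assume the 4-bit signed range [-8, 7] used elsewhere in this example, and the exact
+rounding/saturation behavior of the CUTLASS epilogue is only approximated here.
+
+  #include <algorithm>
+  #include <cmath>
+  #include <cstdint>
+
+  // alpha * accumulator + beta * source, rounded and clamped to the int4 range.
+  int32_t clamped_linear_combination(int32_t accum, int32_t source, float alpha, float beta) {
+    float x = alpha * float(accum) + beta * float(source);
+    float clamped = std::min(7.0f, std::max(-8.0f, x));
+    return int32_t(std::nearbyint(clamped));
+  }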
+*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +// The code section below describes datatype for input, output tensors and computation between +// elements +using ElementAccumulator = int32_t; // Data type of accumulator +using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) +using ElementInputA = cutlass::int4b_t; // Data type of elements in input tensor +using ElementInputB = cutlass::int4b_t; // Data type of elements in input tensor +using ElementOutput = cutlass::int4b_t; // Data type of elements in output tensor + +using LayoutInputA = cutlass::layout::TensorNHWC; +using LayoutInputB = cutlass::layout::TensorNHWC; +using LayoutOutput = cutlass::layout::TensorNHWC; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm75; + +// This code section describes the tile size a thread block will compute +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 128>; // Threadblock tile shape + +// This code section describes tile size a warp will compute +using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>; // Warp tile shape + +// This code section describes the size of MMA op +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>; // TensorCore instruction shape + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +// Number of pipelines you want to use +constexpr int NumStages = 2; + +// This code section describes the epilogue part of the kernel, we use default value +using EpilogueOp = cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, // Data type of output matrix. + 8, // The number of elements per vectorized. + // memory access. This becomes the vector width of + // math instructions in the epilogue too. 
+ ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue>; // Data type for alpha/beta in linear combination + + +using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic +>::Kernel; + +using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options(): + help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 32), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(true), + iterations(20), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) { } + + // Verify the problem size is compatible with the CUTLASS Convolution implementation. + bool valid() { + + // + // CUTLASS attempts to load 128b vectors of int4b_t elements. Consequently, + // all pointers, strides, and tensor extents must be divisible by 32 elements. + // + int const kAlignment = 32; + + if ((input_size.c() % kAlignment) || + (filter_size.n() % kAlignment)) { + + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update( + cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + if (cmd.check_cmd_line_flag("ref-check")) { + reference_check = true; + } + + if (cmd.check_cmd_line_flag("perf-check")) { + measure_performance = true; + } + + if (cmd.check_cmd_line_flag("save-workspace")) { + save_workspace = true; + } + + if (cmd.check_cmd_line_flag("benchmark")) { + benchmark = true; + } + + cmd.get_cmd_line_argument("n", input_size.n()); + cmd.get_cmd_line_argument("h", input_size.h()); + cmd.get_cmd_line_argument("w", input_size.w()); + cmd.get_cmd_line_argument("c", input_size.c()); + + cmd.get_cmd_line_argument("k", filter_size.n()); + cmd.get_cmd_line_argument("r", filter_size.h()); + cmd.get_cmd_line_argument("s", filter_size.w()); + filter_size.c() = input_size.c(); + + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("tag", tag); + + if (filter_size.h() == 3 && filter_size.w() == 3) { + padding = {1, 1, 1, 1}; + } + 
else { + filter_size.h() = 1; + filter_size.w() = 1; + padding = {0, 0, 0, 0}; + } + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "09_turing_tensorop_conv2dfprop example\n\n" + << " This example uses Turing's Tensor Core operators on int4 data types to compute\n" + << " forward convolution on tensors of layout NHWC.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --n Input tensor extent N\n" + << " --h Input tensor extent H\n" + << " --w Input tensor extent W\n" + << " --c Input tensor extent C\n" + << " --k Filter extent K\n" + << " --r Filter extent R\n" + << " --s Filter extent S\n\n" + << " --alpha Epilogue scalar alpha\n" + << " --beta Epilogue scalar beta\n\n" + << " --ref-check If set (true), reference check on the host is computed\n" + << " --perf-check If set (true), performance is measured.\n" + << " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n" + << " --iterations Number of profiling iterations to perform.\n" + << " --save-workspace If set, workspace is written to a text file.\n" + << " --tag String to replicate across the first column in the results table\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" + << "$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; + + return out; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of multiply-adds = NPQK * CRS + int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct Result { + double runtime_ms; + double gflops; + cutlass::Status status; + cutlass::Status reference_check; + cudaError_t error; + + Result(): + runtime_ms(0), + gflops(0), + status(cutlass::Status::kSuccess), + reference_check(cutlass::Status::kInvalid), + error(cudaSuccess) { } + + static std::ostream & print_header(std::ostream &out, Options const &options) { + + if (!options.tag.empty()) { + out << "Name,"; + } + + out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs"; + + return out; + } + + std::ostream & print(std::ostream &out, int idx, Options const &options) { + + if (!options.tag.empty()) { + out << options.tag << ","; + } + + out + << "conv_" << idx << "," + << options.input_size.n() << "," + << options.input_size.h() << "," + << options.input_size.w() << "," + << options.input_size.c() << "," + << options.filter_size.n() << "," + << options.filter_size.h() << "," + << options.filter_size.w() << "," + << runtime_ms << "," + << gflops; + + return out; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Runs one benchmark +Result profile_convolution(Options const &options) { + + Result result; + + // + // 
Allocate host-device tensors using the CUTLASS Utilities. + // + + cutlass::HostTensor tensor_a(options.input_size); + cutlass::HostTensor tensor_b(options.filter_size); + cutlass::HostTensor tensor_c(options.output_size()); + cutlass::HostTensor tensor_ref_c(options.output_size()); + + // + // Initialize tensors + // + + // Fill tensor A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(7), + ElementInputA(-8), + 0); + + // Fill tensor B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(7), + ElementInputB(-8), + 0); + + // Fill tensor C on host with zeros + cutlass::reference::host::TensorFill( + tensor_c.host_view()); + + // Fill tensor C for reference on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_c.host_view()); + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c.sync_device(); + tensor_ref_c.sync_device(); + + // + // Define arguments for CUTLASS Convolution + // + + // mode (kCrossCorrelation or kConvolution) + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + // Split K dimension into 1 partitions + int split_k_slices = 1; + + cutlass::conv::Conv2dProblemSize problem_size( + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + options.output_size(), + mode, + split_k_slices); + + typename ImplicitGemm::Arguments arguments{ + problem_size, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_c.device_ref(), + {options.alpha, options.beta}, + }; + + // + // Initialize CUTLASS Convolution + // + + ImplicitGemm implicit_gemm_op; + + size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + result.status = implicit_gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(result.status); + + // + // Launch initialized CUTLASS kernel + // + result.status = implicit_gemm_op(); + + CUTLASS_CHECK(result.status); + + // + // Optional reference check + // + + if (options.reference_check) { + std::cout << "Verification on host...\n"; + + // Compute with reference implementation + cutlass::reference::host::Conv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementComputeEpilogue, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + problem_size, + tensor_a.host_ref(), + tensor_b.host_ref(), + tensor_c.host_ref(), + tensor_ref_c.host_ref(), + options.alpha, + options.beta + ); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + tensor_c.sync_host(); + + bool passed = cutlass::reference::host::TensorEquals( + tensor_c.host_view(), + tensor_ref_c.host_view()); + + if (!passed) { + result.reference_check = cutlass::Status::kErrorInternal; + std::cout << "ERROR - results miscompared.\n"; + } + else { + result.reference_check = cutlass::Status::kSuccess; + std::cout << "Passed.\n"; + } + } + else { + result.reference_check = cutlass::Status::kInvalid; + } + + if (options.save_workspace) { + + std::stringstream ss; + + ss << "09_tensor_conv_workspace_conv2dfprop_" + << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() + << "_" + << 
options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() + << ".dat"; + + std::ofstream output_workspace(ss.str()); + + output_workspace + << "Input = \n" << tensor_a.host_view() << "\n\n" + << "Filters = \n" << tensor_b.host_view() << "\n\n"; + + if (options.reference_check) { + output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n"; + } + + output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl; + + std::cout << "Results written to '" << ss.str() << "'." << std::endl; + } + + // + // Performance measurement + // + + if (options.measure_performance) { + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + } + + // Record an event at the start of a series of convolution operations. + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Launch a sequence of implicit GEMM operations on the device + for (int iteration = 0; iteration < options.iterations; ++iteration) { + result.status = implicit_gemm_op(); + CUTLASS_CHECK(result.status); + } + + // Record an event when the convolutions have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Print average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + } + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + return 0; + } + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (!(props.major > 7 || (props.major == 7 && props.minor >= 5))) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." 
+ << std::endl; + return 0; + } + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.benchmark) { + // Benchmark several layers + + int batch_sizes[] = {1, 32, 64, 128, 256, 512}; + + struct Benchmark { + int h, w, c, k, r, s; + } layers[] = { + {56, 56, 64, 256, 1, 1}, + {56, 56, 64, 64, 1, 1}, + {56, 56, 64, 64, 3, 3}, + {56, 56, 256, 64, 1, 1}, + {56, 56, 256, 512, 1, 1}, + {56, 56, 256, 128, 1, 1}, + {28, 28, 128, 128, 3, 3}, + {28, 28, 128, 512, 1, 1}, + {28, 28, 512, 128, 1, 1}, + {28, 28, 512, 1024, 1, 1}, + {28, 28, 512, 256, 1, 1}, + {14, 14, 256, 256, 3, 3}, + {14, 14, 256, 1024, 1, 1}, + {14, 14, 1024, 256, 1, 1}, + {14, 14, 1024, 2048, 1, 1}, + {14, 14, 1024, 512, 1, 1}, + {7, 7, 512, 512, 3, 3}, + }; + + Result::print_header(std::cout, options) << std::endl; + + int idx = 1; + + for (auto const &layer : layers) { + for (auto N : batch_sizes) { + + options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c}); + + Result result = profile_convolution(options); + result.print(std::cout, idx, options) << std::endl; + } + + ++idx; + } + } + else { + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + Result result = profile_convolution(options); + + Result::print_header(std::cout, options) << std::endl; + result.print(std::cout, 1, options) << std::endl; + } + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + + + diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu index 1f83a61af9..2b5c779bc6 100644 --- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu +++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu @@ -106,21 +106,6 @@ using Gemm = cutlass::gemm::device::Gemm= 75)) { - std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." - << std::endl; - // Returning zero so this test passes on older Toolkits. Its actions are no-op. - return 0; - } - const int length_m = 5120; const int length_n = 4096; const int length_k = 4096; @@ -265,17 +250,36 @@ int run() { } int main() { + + bool notSupported = false; + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. // // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + << std::endl; + notSupported = true; + } + if (notSupported) { // Returning zero so this test passes on older Toolkits. Its actions are no-op. 
return 0; } - else { - return run(); - } + + return run(); } diff --git a/examples/13_fused_two_gemms/fused_gemm.cu b/examples/13_fused_two_gemms/fused_gemm.cu index edc08d3189..b96a0ef090 100644 --- a/examples/13_fused_two_gemms/fused_gemm.cu +++ b/examples/13_fused_two_gemms/fused_gemm.cu @@ -55,22 +55,6 @@ Performance: int run() { - cudaDeviceProp props; - - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - if (!(props.major * 10 + props.minor >= 75)) { - std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." - << std::endl; - - // Returning zero so this test passes on older Toolkits. Its actions are no-op. - return 0; - } - #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) run_nonfused_gemm_s8_sm80(); run_fused_gemm_s8_sm80(); @@ -85,17 +69,38 @@ int run() { } int main() { + + bool notSupported = false; + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. // // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + << std::endl; + + notSupported = true; + } + + if (notSupported) { // Returning zero so this test passes on older Toolkits. Its actions are no-op. return 0; } - else { - return run(); - } + + return run(); } diff --git a/examples/13_fused_two_gemms/kernel/b2b_gemm.h b/examples/13_fused_two_gemms/kernel/b2b_gemm.h index 5df5e4e38d..a67b1e877c 100644 --- a/examples/13_fused_two_gemms/kernel/b2b_gemm.h +++ b/examples/13_fused_two_gemms/kernel/b2b_gemm.h @@ -335,7 +335,7 @@ struct B2bGemm { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op_1.set_k_partition(threadblock_tile_offset.k()); + output_op_1.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } // Tile iterator loading from source tensor. diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu index 2533557134..84eadc5eab 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu @@ -113,31 +113,6 @@ using Gemm = cutlass::gemm::device::Gemm= 11)) { - std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; - return -1; - } - - cudaDeviceProp props; - - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - if (!((props.major * 10 + props.minor) >= 80)) { - std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80." 
- << std::endl; - - // Return 0 so tests are considered passing if run on unsupported platforms. - return 0; - } - const int length_m = 5120; const int length_n = 4096; const int length_k = 4096; @@ -262,17 +237,36 @@ int run() { } int main() { + + bool notSupported = false; + // Ampere Tensor Core operations exposed with mma.sync and ldmatrix are first available // in CUDA 11.0. // // CUTLASS must be compiled with CUDA 11.0 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ >= 11)) { std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + notSupported = true; + } - // Returning zero so this test passes when built on older Toolkits. - return 0; + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!((props.major * 10 + props.minor) >= 80)) { + std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + notSupported = true; } - else { - return run(); + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; } + + return run(); } diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu index 02f65b199e..1b233c488b 100644 --- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu +++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu @@ -71,7 +71,7 @@ using SmArch = cutlass::arch::Sm80; // This code section describes the tile size a thread block will compute using ShapeMMAThreadBlock = - cutlass::gemm::GemmShape<256, 128, 256>; // <- threadblock tile M = 128, N = 128, K = 256 + cutlass::gemm::GemmShape<128, 128, 256>; // <- threadblock tile M = 128, N = 128, K = 256 // This code section describes tile size a warp will compute using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 256>; // <- warp tile M = 64, N = 64, K = 256 // This code section describes the size of MMA op @@ -123,31 +123,6 @@ constexpr int kMetaSizeInBits = Gemm::kMetaSizeInBits; int run() { - // Ampere Sparse Tensor Core operations exposed with mma.sync and ldmatrix are first available - // in CUDA 11.1. - // - // CUTLASS must be compiled with CUDA 11.1 Toolkit to run these examples. - if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))) { - std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.1 Toolkit or later." << std::endl; - return -1; - } - - cudaDeviceProp props; - - cudaError_t error = cudaGetDeviceProperties(&props, 0); - if (error != cudaSuccess) { - std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; - return -1; - } - - if (!((props.major * 10 + props.minor) >= 80)) { - std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80." - << std::endl; - - // Return 0 so tests are considered passing if run on unsupported platforms. - return 0; - } - const int length_m = 512; const int length_n = 512; const int length_k = 1024; @@ -295,17 +270,37 @@ int run() { } int main() { + + bool notSupported = false; + // Ampere Sparse Tensor Core operations exposed with mma.sync and ldmatrix are first available // in CUDA 11.1. 
// // CUTLASS must be compiled with CUDA 11.1 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1))) { std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.1 Toolkit or later." << std::endl; + notSupported = true; + } - // Returning zero so this test passes when built on older Toolkits. - return 0; + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; } - else { - return run(); + + if (!((props.major * 10 + props.minor) >= 80)) { + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." + << std::endl; + notSupported = true; } + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + return run(); } diff --git a/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt b/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt new file mode 100644 index 0000000000..1b7daac3dc --- /dev/null +++ b/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_example_add_executable( + 22_ampere_tensorop_conv2dfprop + ampere_tensorop_conv2dfprop.cu + ) + diff --git a/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu new file mode 100644 index 0000000000..cb7c398661 --- /dev/null +++ b/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu @@ -0,0 +1,763 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/**
+
+This example shows how to run convolution kernels on an NVIDIA Ampere GPU with Tensor Cores, using
+the functions and data structures provided by CUTLASS.
+
+Writing a single high-performance convolution kernel is hard but doable. Writing high-performance
+kernels at scale that work for multiple problem sizes with good abstractions is significantly
+harder. CUTLASS solves this problem by providing simplified abstractions to compose the sections of
+an implicit GEMM kernel. When used properly, the kernels can approach peak GPU performance.
+
+CUTLASS divides a kernel into hierarchical, composable sections. At the thread, warp, and
+threadblock level, each section computes its own tile, with higher-level tiles composed from
+lower-level ones. Multiple thread tiles (the tile size each thread computes) form a warp tile (the
+tile size each warp computes), and multiple warp tiles form a threadblock tile (the tile size
+computed by a threadblock).
+
+In this example, we split variable initialization into two parts:
+1. Setting up data properties: how tensors are laid out in memory and how the kernel views them
+(the logical-to-physical mapping).
+2. Setting up computation properties: how the tensors above are used to compute the output of the
+convolution.
+
+First, we set up the data types of the input tensor A, the weight tensor B, and the output tensor C,
+along with alpha and beta, since the convolution computes C = alpha * Conv2dFprop(A, B) + beta * C.
+In CUTLASS, the kernel first computes Conv2dFprop(A, B) and defers the rest of the computation to
+the end of the kernel, because alpha * X + beta * C is a simple element-wise operation on
+X (= Conv2dFprop(A, B)) and C. We call this the epilogue of the kernel. Hence, we set the data types
+of alpha and beta to ElementComputeEpilogue = float. We use cutlass::half_t for the elements of the
+input tensors A and B. We convey this to the CUTLASS kernel by initializing the template variables
+ElementAccumulator (float), ElementComputeEpilogue (float), ElementInputA (cutlass::half_t),
+ElementInputB (cutlass::half_t), and ElementOutput (float). Communicating just the data types is not
+enough. Because the data is laid out linearly in memory, we also have to convey the layout of the
+tensors, which we do by initializing the template variables LayoutInputA, LayoutInputB, and
+LayoutOutput to the CUTLASS type TensorNHWC. Next, we set up the rules to compute the epilogue
+alpha * X + beta * C by initializing the template variable EpilogueOp, which takes the data type of
+the output ElementOutput (float), the number of elements per vectorized memory access (128 bits,
+i.e. 4 float elements), the data type of the accumulator (float), and the data type used to compute
+the linear combination (alpha * X + beta * C).
+
+Now that we have set up the properties of the data, we set up the properties of the computation.
+
+Second, we create template variables for the threadblock, warp, and MMA-op tile sizes: 128x128x64,
+64x64x64, and 16x8x16 (MxNxK), respectively. When these are passed to instantiate the CUTLASS
+Implicit GEMM kernel, it internally deduces the number of threads needed per threadblock, the
+amount of shared memory, how to store data in a bank-conflict-free manner, and many other variables
+required to compose, initialize, and launch a high-performance Implicit GEMM kernel. This is the
+beauty of CUTLASS: it relieves developers from understanding and coding complicated hardware
+optimizations that can easily go wrong.
+
+CUTLASS also supports multiple MMA pipelines in a threadblock. An MMA pipeline is the whole process
+of loading input data from global memory to shared memory, loading data from shared memory to
+registers, performing matrix multiplication, and storing results to global memory. The flow below
+shows a typical MMA multistage pipeline
+(see include/cutlass/conv/threadblock/implicit_gemm_multistage.h):
+
+tensor in global memory --cp_async--> tile in shared memory --smem loads--> registers
+--mma--> registers --global stores--> output to global memory
+
+NVIDIA Ampere uses `cp_async` to build a multistage software pipeline that better hides latency.
+
+A few more template variables are initialized, such as the swizzle that determines which threadblock
+tile of the output is computed by which launched threadblock, and the CUDA SM architecture of the
+GPU you want to run on.
+
+These are all put together to create a template variable that describes the CUTLASS Implicit GEMM
+kernel, using the cutlass::conv::device::ImplicitGemmConvolution template.
+
+The next step is to initialize physical data, then instantiate, initialize, and run the CUTLASS
+kernel. We use CUTLASS utilities to initialize, fill, and compare tensors, as they are simple and
+do not get in the way of learning CUTLASS.
+
+Once all the tensors are initialized and filled with data, we create the arguments tuple used to
+launch the CUTLASS kernel. It takes the problem size (N = 1, H = 32, W = 32, C = 32 by default),
+the filter size (K = 32, R = 3, S = 3, C = 32 by default), padding, strides, dilation, tensors,
+alpha, beta and, importantly, the split k-dimension factor. Along with that, we query CUTLASS for
+any scratch-space memory required by the kernel we instantiated. If needed, we allocate it and pass
+it along with the other arguments to initialize the CUTLASS kernel; then the kernel is launched.
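+
+As a rough sketch (using only the type aliases and member functions defined later in this file),
+the sequence described above looks like the following outline rather than complete code:
+
+  typename ImplicitGemm::Arguments arguments{...};     // problem size, tensors, alpha/beta, split-K
+  ImplicitGemm implicit_gemm_op;
+  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+  implicit_gemm_op.initialize(arguments, workspace.get());
+  implicit_gemm_op();                                  // launch the Implicit GEMM convolution kernel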
+ +In this example, we later on launch a reference convolution kernel (from CUTLASS utilities) to +compare if the output from CUTLASS kernel is same as the reference implicit GEMM kernel. +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +// The code section below describes datatype for input, output tensors and computation between +// elements +using ElementAccumulator = float; // Data type of accumulator +using ElementComputeEpilogue = float; // Data type of epilogue computation (alpha, beta) +using ElementInputA = cutlass::half_t; // Data type of elements in input tensor +using ElementInputB = cutlass::half_t; // Data type of elements in input tensor +using ElementOutput = float; // Data type of elements in output tensor + +using LayoutInputA = cutlass::layout::TensorNHWC; +using LayoutInputB = cutlass::layout::TensorNHWC; +using LayoutOutput = cutlass::layout::TensorNHWC; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm80; + +// This code section describes the tile size a thread block will compute +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>; // Threadblock tile shape + +// This code section describes tile size a warp will compute +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; // Warp tile shape + +// This code section describes the size of MMA op +using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // TensorCore instruction shape + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +// Number of pipelines you want to use +constexpr int NumStages = 3; + +// This code section describe iterator algorithm selected is Analytic or Optimized +static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kAnalytic; + +// This code section describes the epilogue part of the kernel, we use default value +using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // Data type of output matrix. + 128 / cutlass::sizeof_bits::value, // The number of elements per vectorized. + // memory access. This becomes the vector width of + // math instructions in the epilogue too. 
+ ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue>; // Data type for alpha/beta in linear combination + + +using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm +>::Kernel; + +using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + cutlass::Tensor4DCoord input_size; + cutlass::Tensor4DCoord filter_size; + cutlass::Tensor4DCoord padding; + cutlass::MatrixCoord conv_stride; + cutlass::MatrixCoord dilation; + bool reference_check; + bool measure_performance; + int iterations; + bool save_workspace; + ElementComputeEpilogue alpha; + ElementComputeEpilogue beta; + bool benchmark; + std::string tag; + + Options(): + help(false), + input_size(1, 32, 32, 32), + filter_size(32, 3, 3, 32), + padding(1, 1, 1, 1), + conv_stride(1, 1), + dilation(1, 1), + reference_check(false), + measure_performance(true), + iterations(20), + save_workspace(false), + alpha(1), + beta(0), + benchmark(false) { } + + // Verify the problem size is compatible with the CUTLASS Convolution implementation. + bool valid() { + + // + // CUTLASS attempts to load 128b vectors of cutlass::half_t (F16) elements. Consequently, + // all pointers, strides, and tensor extents must be divisible by 8 elements. + // + int const kAlignment = 8; + + if ((input_size.c() % kAlignment) || + (filter_size.n() % kAlignment)) { + + // misaligned tensors + return false; + } + + // Invalid padding + if ((padding.h() != filter_size.h() / 2) || + (padding.w() != filter_size.w() / 2)) { + + return false; + } + + return true; + } + + /// Updates input and filter sizes + void update( + cutlass::Tensor4DCoord input_size, + cutlass::Tensor4DCoord filter_size) { + + this->input_size = input_size; + this->filter_size = filter_size; + + padding.n() = filter_size.h() / 2; + padding.h() = filter_size.h() / 2; + padding.w() = filter_size.w() / 2; + padding.c() = filter_size.w() / 2; + } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + } + + if (cmd.check_cmd_line_flag("ref-check")) { + reference_check = true; + } + + if (cmd.check_cmd_line_flag("perf-check")) { + measure_performance = true; + } + + if (cmd.check_cmd_line_flag("save-workspace")) { + save_workspace = true; + } + + if (cmd.check_cmd_line_flag("benchmark")) { + benchmark = true; + } + + cmd.get_cmd_line_argument("n", input_size.n()); + cmd.get_cmd_line_argument("h", input_size.h()); + cmd.get_cmd_line_argument("w", input_size.w()); + cmd.get_cmd_line_argument("c", input_size.c()); + + cmd.get_cmd_line_argument("k", filter_size.n()); + cmd.get_cmd_line_argument("r", filter_size.h()); + cmd.get_cmd_line_argument("s", filter_size.w()); + filter_size.c() = input_size.c(); + + cmd.get_cmd_line_argument("alpha", alpha); + cmd.get_cmd_line_argument("beta", beta); + + cmd.get_cmd_line_argument("iterations", iterations); + cmd.get_cmd_line_argument("tag", tag); + + if (filter_size.h() == 3 && filter_size.w() == 3) { + padding = {1, 1, 1, 1}; + } + else { + filter_size.h() 
= 1; + filter_size.w() = 1; + padding = {0, 0, 0, 0}; + } + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "22_ampere_tensorop_conv2dfprop example\n\n" + << " This example uses Ampere's Tensor Core operators on F16 data types to compute\n" + << " forward convolution on tensors of layout NHWC.\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement.\n\n" + << " --n Input tensor extent N\n" + << " --h Input tensor extent H\n" + << " --w Input tensor extent W\n" + << " --c Input tensor extent C\n" + << " --k Filter extent K\n" + << " --r Filter extent R\n" + << " --s Filter extent S\n\n" + << " --alpha Epilogue scalar alpha\n" + << " --beta Epilogue scalar beta\n\n" + << " --ref-check If set (true), reference check on the host is computed\n" + << " --perf-check If set (true), performance is measured.\n" + << " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n" + << " --iterations Number of profiling iterations to perform.\n" + << " --save-workspace If set, workspace is written to a text file.\n" + << " --tag String to replicate across the first column in the results table\n"; + + out << "\n\nExamples:\n\n" + << "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" + << "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; + + return out; + } + + /// Computes the output tensor size (NPQK) + cutlass::Tensor4DCoord output_size() const { + return cutlass::Tensor4DCoord( + input_size.n(), + (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1, + (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1, + filter_size.n()); + } + + /// Compute performance in GFLOP/s + double gflops(double runtime_s) const { + + // Number of multiply-adds = NPQK * CRS + int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()); + + // Two flops per multiply-add + return 2.0 * double(fmas) / double(1.0e9) / runtime_s; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct Result { + double runtime_ms; + double gflops; + cutlass::Status status; + cutlass::Status reference_check; + cudaError_t error; + + Result(): + runtime_ms(0), + gflops(0), + status(cutlass::Status::kSuccess), + reference_check(cutlass::Status::kInvalid), + error(cudaSuccess) { } + + static std::ostream & print_header(std::ostream &out, Options const &options) { + + if (!options.tag.empty()) { + out << "Name,"; + } + + out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs"; + + return out; + } + + std::ostream & print(std::ostream &out, int idx, Options const &options) { + + if (!options.tag.empty()) { + out << options.tag << ","; + } + + out + << "conv_" << idx << "," + << options.input_size.n() << "," + << options.input_size.h() << "," + << options.input_size.w() << "," + << options.input_size.c() << "," + << options.filter_size.n() << "," + << options.filter_size.h() << "," + << options.filter_size.w() << "," + << runtime_ms << "," + << gflops; + + return out; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Runs one benchmark +Result profile_convolution(Options const &options) { + + Result result; + + // + // Allocate host-device 
tensors using the CUTLASS Utilities. + // + + cutlass::HostTensor tensor_a(options.input_size); + cutlass::HostTensor tensor_b(options.filter_size); + cutlass::HostTensor tensor_c(options.output_size()); + cutlass::HostTensor tensor_ref_c(options.output_size()); + + // + // Initialize tensors + // + + // Fill tensor A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(7), + ElementInputA(-8), + 0); + + // Fill tensor B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(7), + ElementInputB(-8), + 0); + + // Fill tensor C on host with zeros + cutlass::reference::host::TensorFill( + tensor_c.host_view()); + + // Fill tensor C for reference on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_c.host_view()); + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c.sync_device(); + tensor_ref_c.sync_device(); + + // + // Define arguments for CUTLASS Convolution + // + + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + + // Split K dimension into 1 partitions + int split_k_slices = 1; + + typename ImplicitGemm::Arguments arguments{ + { + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + options.output_size(), + mode, + split_k_slices + }, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_c.device_ref(), + {options.alpha, options.beta}, + + + }; + + // + // Initialize CUTLASS Convolution + // + + ImplicitGemm implicit_gemm_op; + + size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + result.status = implicit_gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(result.status); + + // + // Launch initialized CUTLASS kernel + // + result.status = implicit_gemm_op(); + + CUTLASS_CHECK(result.status); + + // + // Optional reference check + // + + if (options.reference_check) { + std::cout << "Verification on host...\n"; + + cutlass::conv::Conv2dProblemSize problem_size( + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + mode + ); + + // Compute with reference implementation + cutlass::reference::host::Conv2dFprop< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementComputeEpilogue, + ElementAccumulator, + cutlass::NumericConverter + >( + problem_size, + tensor_a.host_ref(), + tensor_b.host_ref(), + tensor_c.host_ref(), + tensor_ref_c.host_ref(), + options.alpha, + options.beta + ); + + // Check if output from CUTLASS kernel and reference kernel are equal or not + tensor_c.sync_host(); + + bool passed = cutlass::reference::host::TensorEquals( + tensor_c.host_view(), + tensor_ref_c.host_view()); + + if (!passed) { + result.reference_check = cutlass::Status::kErrorInternal; + std::cout << "ERROR - results miscompared.\n"; + } + else { + result.reference_check = cutlass::Status::kSuccess; + std::cout << "Passed.\n"; + } + } + else { + result.reference_check = cutlass::Status::kInvalid; + } + + if (options.save_workspace) { + + std::stringstream ss; + + ss << "22_ampere_workspace_conv2dfprop_" + << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() 
+ << "_" + << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() + << ".dat"; + + std::ofstream output_workspace(ss.str()); + + output_workspace + << "Input = \n" << tensor_a.host_view() << "\n\n" + << "Filters = \n" << tensor_b.host_view() << "\n\n"; + + if (options.reference_check) { + output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n"; + } + + output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl; + + std::cout << "Results written to '" << ss.str() << "'." << std::endl; + } + + // + // Performance measurement + // + + if (options.measure_performance) { + + cudaEvent_t events[2]; + + for (auto & event : events) { + result.error = cudaEventCreate(&event); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + } + + // Record an event at the start of a series of convolution operations. + result.error = cudaEventRecord(events[0]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Launch a sequence of implicit GEMM operations on the device + for (int iteration = 0; iteration < options.iterations; ++iteration) { + result.status = implicit_gemm_op(); + CUTLASS_CHECK(result.status); + } + + // Record an event when the convolutions have been launched. + result.error = cudaEventRecord(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Wait for work on the device to complete. + result.error = cudaEventSynchronize(events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Measure elapsed runtime + float runtime_ms = 0; + result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]); + if (result.error != cudaSuccess) { + std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl; + return result; + } + + // Print average runtime and GFLOPs. + result.runtime_ms = double(runtime_ms) / double(options.iterations); + result.gflops = options.gflops(result.runtime_ms / 1000.0); + + // Cleanup + for (auto event : events) { + (void)cudaEventDestroy(event); + } + } + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, char const **args) { + + bool notSupported = false; + + // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples. + if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." 
+ << std::endl; + notSupported = true; + } + + if (notSupported) { + return 0; + } + + Options options; + + options.parse(argc, args); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.benchmark) { + // Benchmark several layers + + int batch_sizes[] = {1, 32, 64, 128, 256, 512}; + + struct Benchmark { + int h, w, c, k, r, s; + } layers[] = { + {56, 56, 64, 256, 1, 1}, + {56, 56, 64, 64, 1, 1}, + {56, 56, 64, 64, 3, 3}, + {56, 56, 256, 64, 1, 1}, + {56, 56, 256, 512, 1, 1}, + {56, 56, 256, 128, 1, 1}, + {28, 28, 128, 128, 3, 3}, + {28, 28, 128, 512, 1, 1}, + {28, 28, 512, 128, 1, 1}, + {28, 28, 512, 1024, 1, 1}, + {28, 28, 512, 256, 1, 1}, + {14, 14, 256, 256, 3, 3}, + {14, 14, 256, 1024, 1, 1}, + {14, 14, 1024, 256, 1, 1}, + {14, 14, 1024, 2048, 1, 1}, + {14, 14, 1024, 512, 1, 1}, + {7, 7, 512, 512, 3, 3}, + }; + + Result::print_header(std::cout, options) << std::endl; + + int idx = 1; + + for (auto const &layer : layers) { + for (auto N : batch_sizes) { + + options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c}); + + Result result = profile_convolution(options); + result.print(std::cout, idx, options) << std::endl; + } + + ++idx; + } + } + else { + + // Execute one problem size + if (!options.valid()) { + std::cerr << "Invalid problem." << std::endl; + return -1; + } + + Result result = profile_convolution(options); + + Result::print_header(std::cout, options) << std::endl; + result.print(std::cout, 1, options) << std::endl; + } + + return 0; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + + + diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index aabfa53c62..d51df92c70 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -22,15 +22,20 @@ set(CUTLASS_EXAMPLES_COMMON_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common) +add_custom_target(cutlass_examples) +add_custom_target(test_examples) + function(cutlass_example_add_executable NAME) set(options) set(oneValueArgs) - set(multiValueArgs) + set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS}) + add_dependencies(cutlass_examples ${NAME}) + target_link_libraries( ${NAME} PRIVATE @@ -44,18 +49,20 @@ function(cutlass_example_add_executable NAME) ${CUTLASS_EXAMPLES_COMMON_SOURCE_DIR} ) - add_custom_target( - test_${NAME} - COMMAND - ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ - DEPENDS - ${NAME} + install( + TARGETS ${NAME} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) -endfunction() + cutlass_add_executable_tests( + test_examples_${NAME} ${NAME} + DEPENDS ${__DEPENDS} + DEPENDEES test_examples ${__DEPENDEES} + TEST_COMMAND_OPTIONS ${__TEST_COMMAND_OPTIONS} + DISABLE_EXECUTABLE_INSTALL_RULE + ) -add_custom_target(cutlass_examples) -add_custom_target(test_examples) +endfunction() foreach(EXAMPLE 00_basic_gemm @@ -67,16 +74,16 @@ foreach(EXAMPLE 06_splitK_gemm 07_volta_tensorop_gemm 08_turing_tensorop_gemm + 09_turing_tensorop_conv2dfprop 10_planar_complex 11_planar_complex_array 12_gemm_bias_relu 13_fused_two_gemms 14_ampere_tf32_tensorop_gemm 15_ampere_sparse_tensorop_gemm -) + 22_ampere_tensorop_conv2dfprop + ) add_subdirectory(${EXAMPLE}) - add_dependencies(cutlass_examples ${EXAMPLE}) - add_dependencies(test_examples test_${EXAMPLE}) endforeach() diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h index 
04c568760e..045196cb8f 100644 --- a/include/cutlass/arch/memory_sm80.h +++ b/include/cutlass/arch/memory_sm80.h @@ -74,6 +74,10 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async { + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + /// Copy CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { @@ -104,6 +108,10 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async_zfill { + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + /// Copy with zero fill CUTLASS_DEVICE cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) { @@ -138,6 +146,10 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async { + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + /// Copy CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { @@ -171,6 +183,10 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async_zfill { + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + /// Copy with zero fill CUTLASS_DEVICE cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { @@ -235,4 +251,3 @@ CUTLASS_DEVICE void cp_async_wait<0>() { } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h index 49f3979cab..729cd17917 100644 --- a/include/cutlass/arch/mma.h +++ b/include/cutlass/arch/mma.h @@ -201,5 +201,5 @@ struct SparseMma; #include "cutlass/arch/mma_sm70.h" #include "cutlass/arch/mma_sm75.h" #include "cutlass/arch/mma_sm80.h" -#include "cutlass/arch/sp_mma_sm80.h" +#include "cutlass/arch/mma_sparse_sm80.h" ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h index a862e65df5..c5e0db9720 100644 --- a/include/cutlass/arch/mma_sm75.h +++ b/include/cutlass/arch/mma_sm75.h @@ -365,7 +365,7 @@ struct Mma< } }; -/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 template <> struct Mma< gemm::GemmShape<8, 8, 16>, @@ -599,7 +599,7 @@ struct Mma< } }; -/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 template <> struct Mma< gemm::GemmShape<8,8,16>, diff --git a/include/cutlass/arch/sp_mma_sm80.h b/include/cutlass/arch/mma_sparse_sm80.h similarity index 99% rename from include/cutlass/arch/sp_mma_sm80.h rename to include/cutlass/arch/mma_sparse_sm80.h index 0c8989b86a..a93fd2924c 100644 --- a/include/cutlass/arch/sp_mma_sm80.h +++ b/include/cutlass/arch/mma_sparse_sm80.h @@ -29,7 +29,15 @@ #pragma once -#include "mma_sm80.h" +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/wmma.h b/include/cutlass/arch/wmma.h index 88968abdc5..0a556aee3a 100644 --- 
a/include/cutlass/arch/wmma.h +++ b/include/cutlass/arch/wmma.h @@ -52,7 +52,7 @@ #endif #endif -#endif //__clang__ +#endif //!defined(__clang__) #if defined(CUTLASS_ARCH_WMMA_ENABLED) @@ -82,6 +82,12 @@ struct CutlassToWmmaDataType { using Type = __half; }; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11) +template<> +struct CutlassToWmmaDataType { + using Type = __nv_bfloat16; +}; +#endif /// Statically maps int8_t => char template<> @@ -158,6 +164,14 @@ template<> struct WmmaToCutlassDataType<__half> { using Type = cutlass::half_t; }; + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11) +template<> +struct WmmaToCutlassDataType<__nv_bfloat16> { + using Type = cutlass::bfloat16_t; +}; +#endif + //////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/conv2d_problem_size.h b/include/cutlass/conv/conv2d_problem_size.h new file mode 100644 index 0000000000..735103722d --- /dev/null +++ b/include/cutlass/conv/conv2d_problem_size.h @@ -0,0 +1,450 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This file contains definitions and utility functions for describing convolution problem sizes. + + Conv2dProblem desciption: + activation (NHWC), + filter (KRSC), + output (NPQK), + pading (pad_h, pad_w), + stride (stride_h, stride_w), + dilation (dilation_h, dilation_w). 
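+
+    For the constructors below that compute the output size rather than taking it as an argument,
+    P and Q are derived from the remaining parameters exactly as implemented in those constructors:
+
+      P = ((H + pad_h * 2 - R * dilation_h) / stride_h) + 1
+      Q = ((W + pad_w * 2 - S * dilation_w) / stride_w) + 1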
+ + Free functions to map: + Map tensor extents (Conv2d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator) + Map tensor sizes (Conv2d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator) + Map tensor problem sizes (Conv2d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator) +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_coord.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/conv/convolution.h" + +namespace cutlass { +namespace conv { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Problem size structure +struct Conv2dProblemSize { + + // Conv2d strictly problem size parameters + int N, H, W, C, P, Q, K, R, S; + int pad_h, pad_w; + int stride_h, stride_w; + int dilation_h, dilation_w; + Mode mode; + + // Conv2d implementation-related parameters + int split_k_slices; + int groups; + + // + // Methods + // + +public: + CUTLASS_HOST_DEVICE + Conv2dProblemSize(): + N(0), H(0), W(0), C(0), P(0), Q(0), K(0), R(0), S(0), + pad_h(0), pad_w(0), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), + mode(Mode::kConvolution), split_k_slices(1), groups(1) { } + + /// Constructor for default padding, stride, dilation, and split-K + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + int N, + int H, + int W, + int C, + int P, + int Q, + int K, + int R, + int S, + Mode mode + ): + N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S), + pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), + mode(mode), split_k_slices(1), groups (1) { } + + /// Constructor + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + int N, + int H, + int W, + int C, + int K, + int R, + int S, + int P, + int Q, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + Mode mode, + int split_k_slices = 1, + int groups = 1 + ): + N(N), H(H), W(W), C(C), K(K), R(R), S(S), P(P), Q(Q), + pad_h(pad_h), pad_w(pad_w), stride_h(stride_h), stride_w(stride_w), + dilation_h(dilation_h), dilation_w(dilation_w), + mode(mode), split_k_slices(split_k_slices), groups (groups) { } + + /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord + // set user-defined output size and sets P and Q (include all data members in ctor) + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + cutlass::Tensor4DCoord input_size, // NHWC + cutlass::Tensor4DCoord filter_size, // KRSC + cutlass::Tensor4DCoord padding, // pad_h, _, pad_w, _ + cutlass::MatrixCoord stride, // stride_h, stride_w + cutlass::MatrixCoord dilation, // dilation_h, dilation_w + cutlass::Tensor4DCoord output_size, // NPQK + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), + pad_h(padding[0]), pad_w(padding[2]), + stride_h(stride.row()), stride_w(stride.column()), + dilation_h(dilation.row()), dilation_w(dilation.column()), + P(output_size.h()), Q(output_size.w()), + mode(mode), split_k_slices(split_k_slices), groups(groups) {} + + /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord + // computes output size and sets P and Q (skip output from ctor arguments) + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + cutlass::Tensor4DCoord input_size, // NHWC + cutlass::Tensor4DCoord 
filter_size, // KRSC + cutlass::Tensor4DCoord padding, // pad_h, _, pad_w, _ + cutlass::MatrixCoord stride, // stride_h, stride_w + cutlass::MatrixCoord dilation, // dilation_h, dilation_w + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), + pad_h(padding[0]), pad_w(padding[2]), + stride_h(stride.row()), stride_w(stride.column()), + dilation_h(dilation.row()), dilation_w(dilation.column()), + mode(mode), split_k_slices(split_k_slices), groups(groups) { + // set output P and Q + P = ((H + pad_h * 2 - R * dilation_h) / stride_h) + 1; + Q = ((W + pad_w * 2 - S * dilation_w) / stride_w) + 1; + } + + /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord + // set user-defined output size and sets P and Q (skip padding, striding, and dilation) + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + cutlass::Tensor4DCoord input_size, // NHWC + cutlass::Tensor4DCoord filter_size, // KRSC + cutlass::Tensor4DCoord output_size, // NPQK + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), + P(output_size.h()), Q(output_size.w()), + pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), + dilation_h(1), dilation_w(1), + mode(mode), split_k_slices(split_k_slices), groups(groups) {} + + // Reset covolution mode in the problem + CUTLASS_HOST_DEVICE + Conv2dProblemSize reset_mode(cutlass::conv::Mode mode_) { + Conv2dProblemSize tmp(*this); + tmp.mode = mode_; + return tmp; + } + + // Reset covolution mode in the problem + CUTLASS_HOST_DEVICE + Conv2dProblemSize reset_split_k_slices(int split_k_slices_) { + Conv2dProblemSize tmp(*this); + tmp.split_k_slices = split_k_slices_; + return tmp; + } + + /// Equality operator (ignores mode and split_k_slice) + CUTLASS_HOST_DEVICE + bool operator==(Conv2dProblemSize const &conv) const { + return ( + (N == conv.N) && (W == conv.H) && (W == conv.W) && (C == conv.C) && + (K == conv.K) && (R == conv.R) && (S == conv.S) && + (P == conv.P) && (Q == conv.Q) && + (pad_h == conv.pad_h) && (pad_w == conv.pad_w) && + (stride_h == conv.stride_h) && (stride_w == conv.stride_w) && + (dilation_h == conv.dilation_h) && (dilation_h == conv.dilation_h) + ); + } + + /// Inequality operator + CUTLASS_HOST_DEVICE + bool operator!=(Conv2dProblemSize const &rhs) const { + return !(*this == rhs); + } + + /// Returns activation extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord activation_extent() const { + + return cutlass::Tensor4DCoord ({N, H, W, C}); + } + + /// Returns filter extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord filter_extent() const { + + return cutlass::Tensor4DCoord ({K, R, S, C}); + } + + /// Returns output extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord output_extent() const { + + return cutlass::Tensor4DCoord ({N, P, Q, K}); + } + + /// Returns activation size in number of elements + CUTLASS_HOST_DEVICE + int64_t activation_size() const { + + return (N * H * W * C); + } + + /// Returns filter size in number of elements + CUTLASS_HOST_DEVICE + int64_t filter_size() const { + + return (K * R * S * C); + } + + /// Returns output size in number of elements + CUTLASS_HOST_DEVICE + int64_t 
output_size() const { + + return (N * P * Q * K); + } + + /// Returns output extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord padding() const { + + return cutlass::Tensor4DCoord ({pad_h, pad_h, pad_w, pad_w}); + } + + /// Returns stride as MatrixCoord + CUTLASS_HOST_DEVICE + cutlass::MatrixCoord stride() const { + + return cutlass::MatrixCoord ({stride_h, stride_w}); + } + + /// Returns dilation as MatrixCoord + CUTLASS_HOST_DEVICE + cutlass::MatrixCoord dilation() const { + + return cutlass::MatrixCoord ({dilation_h, dilation_w}); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// ImplicitGemm helper functions // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Determine the problem size of the implicit GEMM operation +CUTLASS_HOST_DEVICE +cutlass::gemm::GemmCoord implicit_gemm_problem_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + // Compute problem size + switch (conv_operator) { + case Operator::kFprop: + return gemm::GemmCoord( + problem_size.N * problem_size.P * problem_size.Q, + problem_size.K, + problem_size.R * problem_size.S * problem_size.C + ); + case Operator::kDgrad: + return gemm::GemmCoord( + problem_size.N * problem_size.H * problem_size.W, + problem_size.C, + problem_size.R * problem_size.S * problem_size.K + ); + case Operator::kWgrad: + return gemm::GemmCoord( + problem_size.K, + problem_size.R * problem_size.S * problem_size.C, + problem_size.N * problem_size.P * problem_size.Q + ); + default: + break; + } + return gemm::GemmCoord(); +} + +// Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm +CUTLASS_HOST_DEVICE +int implicit_gemm_k_iterations( + Operator conv_operator, + int threadblock_K, + Conv2dProblemSize const &problem_size) { + + int iterations = 0; + int elements_per_split_k_slice = 0; + + switch (conv_operator) { + case Operator::kFprop: + elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kDgrad: + elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kWgrad: + elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; + break; + + default: + break; + } + + return iterations; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output) +//////////////////////////////////////////////////////////////////////////////// +/// Returns ImplicitGemm tensor A extent as Tensor4DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor4DCoord implicit_gemm_tensor_a_extent( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.output_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_extent(); + 
default : break; + } + return cutlass::Tensor4DCoord(); +} + +/// Returns ImplicitGemm tensor B extent as Tensor4DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor4DCoord implicit_gemm_tensor_b_extent( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent(); + default : break; + } + return cutlass::Tensor4DCoord(); +} + +/// Returns ImplicitGemm tensor C extent as Tensor4DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor4DCoord implicit_gemm_tensor_c_extent( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent(); + default : break; + } + return cutlass::Tensor4DCoord(); +} + +/// Returns ImplicitGemm tensor A size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_a_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.output_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor B size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_b_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor C size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_c_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.activation_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_size(); + default : break; + } + return 0; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace conv +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/conv3d_problem_size.h b/include/cutlass/conv/conv3d_problem_size.h new file mode 100644 index 0000000000..91827d2724 --- /dev/null +++ b/include/cutlass/conv/conv3d_problem_size.h @@ -0,0 +1,453 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This file contains definitions and utility functions for describing convolution problem sizes. + + Conv3dProblem desciption: + activation (NDHWC), + filter (KTRSC), + output (NZPQK), + pading (pad_d, pad_h, pad_w), + stride (stride_d, stride_h, stride_w), + dilation (dilation_d, dilation_h, dilation_w). + + Free functions to map: + Map tensor extents (Conv3d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator) + Map tensor sizes (Conv3d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator) + Map tensor problem sizes (Conv3d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator) +*/ + +#pragma once + +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +namespace cutlass { +namespace conv { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Problem size structure +struct Conv3dProblemSize : public Conv2dProblemSize { + // + // Type definitions + // + + // 3D coordinate for padding, stride, and dilation in (d, h, w) dimensions + using Coord3D = Coord<3>; + + // + // Data members + // + + // Conv3d strictly problem size parameters + int D, T, Z; // input depth, filter depth, output depth + int pad_d; // padding in depth dimension + int stride_d; // stride in depth dimension + int dilation_d; // dilation in depth dimension + + // + // Methods + // +public: + CUTLASS_HOST_DEVICE + Conv3dProblemSize(): + D(0), T(0), Z(0), + pad_d(0), + stride_d(1), + dilation_d(1), + Conv2dProblemSize() { } + + /// Constructor for default padding, stride, dilation, and split-K + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + int N, + int D, + int H, + int W, + int C, + int Z, + int P, + int Q, + int K, + int T, + int R, + int S, + Mode mode + ): + D(D), T(T), Z(Z), + pad_d(T / 2), stride_d(1), dilation_d(1), + Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode) { } + + /// Constructor + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + int N, + int D, + int H, + int W, + int C, + int K, + int T, + int R, + int S, + int Z, + int P, + int Q, + int pad_d, + int pad_h, + int pad_w, + int stride_d, + int stride_h, + int stride_w, + int dilation_d, + int dilation_h, + int dilation_w, + 
Mode mode,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    D(D), T(T), Z(Z),
+    pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d),
+    Conv2dProblemSize(
+      N, H, W, C, K, R, S, P, Q,
+      pad_h, pad_w,
+      stride_h, stride_w,
+      dilation_h, dilation_w,
+      mode, split_k_slices, groups) { }
+
+  /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D
+  // set *user-defined* output size and sets Z, P, and Q (include all data members in ctor)
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(
+    cutlass::Tensor5DCoord input_size,    // NDHWC
+    cutlass::Tensor5DCoord filter_size,   // KTRSC
+    Coord3D padding,                      // pad_d, pad_h, pad_w
+    Coord3D stride,                       // stride_d, stride_h, stride_w
+    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
+    cutlass::Tensor5DCoord output_size,   // NZPQK
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    D(input_size.d()), T(filter_size.d()), Z(output_size.d()),
+    pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]),
+    Conv2dProblemSize(
+      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
+      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
+      {padding[1], padding[1], padding[2], padding[2]},
+      {stride[1], stride[2]},
+      {dilation[1], dilation[2]},
+      {output_size.n(), output_size.h(), output_size.w(), output_size.c()},
+      mode, split_k_slices, groups
+    ) { }
+
+  /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D
+  // *computes* output size and sets Z, P and Q (include all data members in ctor)
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize(
+    cutlass::Tensor5DCoord input_size,    // NDHWC
+    cutlass::Tensor5DCoord filter_size,   // KTRSC
+    Coord3D padding,                      // pad_d, pad_h, pad_w
+    Coord3D stride,                       // stride_d, stride_h, stride_w
+    Coord3D dilation,                     // dilation_d, dilation_h, dilation_w
+    cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation,
+    int split_k_slices = 1,
+    int groups = 1
+  ):
+    D(input_size.d()), T(filter_size.d()),
+    pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]),
+    Conv2dProblemSize(
+      {input_size.n(), input_size.h(), input_size.w(), input_size.c()},
+      {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()},
+      {padding[1], padding[1], padding[2], padding[2]},
+      {stride[1], stride[2]},
+      {dilation[1], dilation[2]},
+      mode, split_k_slices, groups
+    ) {
+    // set output Z
+    Z = ((D + pad_d - T * dilation_d) / stride_d) + 1;
+  }
+
+  /// Equality operator (ignores mode and split_k_slices)
+  CUTLASS_HOST_DEVICE
+  bool operator==(Conv3dProblemSize const &conv) const {
+    return (
+      (N == conv.N) && (D == conv.D) && (H == conv.H) && (W == conv.W) && (C == conv.C) &&
+      (K == conv.K) && (T == conv.T) && (R == conv.R) && (S == conv.S) &&
+      (Z == conv.Z) && (P == conv.P) && (Q == conv.Q) &&
+      (pad_d == conv.pad_d) && (pad_h == conv.pad_h) && (pad_w == conv.pad_w) &&
+      (stride_d == conv.stride_d) && (stride_h == conv.stride_h) && (stride_w == conv.stride_w) &&
+      (dilation_d == conv.dilation_d) && (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w)
+    );
+  }
+
+  /// Inequality operator
+  CUTLASS_HOST_DEVICE
+  bool operator!=(Conv3dProblemSize const &rhs) const {
+    return !(*this == rhs);
+  }
+
+  // Reset convolution mode in the problem
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize reset_mode(cutlass::conv::Mode mode_) {
+    Conv3dProblemSize tmp(*this);
+    tmp.mode = mode_;
+    return tmp;
+  }
+
+  // Reset split-K slice count in the problem
+  CUTLASS_HOST_DEVICE
+  Conv3dProblemSize
reset_split_k_slices(int split_k_slices_) { + Conv3dProblemSize tmp(*this); + tmp.split_k_slices = split_k_slices_; + return tmp; + } + + /// Returns activation extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor5DCoord activation_extent() const { + + return cutlass::Tensor5DCoord ({N, D, H, W, C}); + } + + /// Returns filter extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor5DCoord filter_extent() const { + + return cutlass::Tensor5DCoord ({K, T, R, S, C}); + } + + /// Returns output extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor5DCoord output_extent() const { + + return cutlass::Tensor5DCoord ({N, Z, P, Q, K}); + } + + /// Returns activation size in number of elements + CUTLASS_HOST_DEVICE + int64_t activation_size() const { + + return (N * D * H * W * C); + } + + /// Returns filter size in number of elements + CUTLASS_HOST_DEVICE + int64_t filter_size() const { + + return (K * T * R * S * C); + } + + /// Returns output size in number of elements + CUTLASS_HOST_DEVICE + int64_t output_size() const { + + return (N * Z * P * Q * K); + } + + /// Returns output extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + Coord3D padding() const { + + return Coord3D ({pad_d, pad_h, pad_w}); + } + + /// Returns stride as MatrixCoord + CUTLASS_HOST_DEVICE + Coord3D stride() const { + + return Coord3D ({stride_d, stride_h, stride_w}); + } + + /// Returns dilation as MatrixCoord + CUTLASS_HOST_DEVICE + Coord3D dilation() const { + + return Coord3D ({dilation_d, dilation_h, dilation_w}); + } + +}; + + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// ImplicitGemm helper functions // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Determine the problem size of the implicit GEMM operation +CUTLASS_HOST_DEVICE +cutlass::gemm::GemmCoord implicit_gemm_problem_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + // Compute problem size + switch (conv_operator) { + case Operator::kFprop: + return gemm::GemmCoord( + problem_size.N * problem_size.Z * problem_size.P * problem_size.Q, + problem_size.K, + problem_size.T * problem_size.R * problem_size.S * problem_size.C + ); + case Operator::kDgrad: + return gemm::GemmCoord( + problem_size.N * problem_size.D * problem_size.H * problem_size.W, + problem_size.C, + problem_size.T * problem_size.R * problem_size.S * problem_size.K + ); + case Operator::kWgrad: + return gemm::GemmCoord( + problem_size.K, + problem_size.T * problem_size.R * problem_size.S * problem_size.C, + problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + ); + default: + break; + } + return gemm::GemmCoord(); +} + +// Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm +CUTLASS_HOST_DEVICE +int implicit_gemm_k_iterations( + Operator conv_operator, + int threadblock_K, + Conv3dProblemSize const &problem_size) { + + int iterations = 0; + int elements_per_split_k_slice = 0; + + switch (conv_operator) { + case Operator::kFprop: + elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kDgrad: + elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.T * problem_size.R * 
problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kWgrad: + elements_per_split_k_slice = (problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; + break; + + default: + break; + } + + return iterations; +} + +//////////////////////////////////////////////////////////////////////////////// +// Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output) +//////////////////////////////////////////////////////////////////////////////// +/// Returns ImplicitGemm tensor A extent as Tensor5DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor5DCoord implicit_gemm_tensor_a_extent( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.output_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_extent(); + default : break; + } + return cutlass::Tensor5DCoord(); +} + +/// Returns ImplicitGemm tensor B extent as Tensor5DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor5DCoord implicit_gemm_tensor_b_extent( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent(); + default : break; + } + return cutlass::Tensor5DCoord(); +} + +/// Returns ImplicitGemm tensor C extent as Tensor5DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor5DCoord implicit_gemm_tensor_c_extent( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_extent(); + case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent(); + default : break; + } + return cutlass::Tensor5DCoord(); +} + +/// Returns ImplicitGemm tensor A size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_a_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.output_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor B size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_b_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_size(); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor C size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_c_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_size(); + case 
cutlass::conv::Operator::kDgrad: return problem_size.activation_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_size(); + default : break; + } + return 0; +} + +} // namespace conv +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/convolution.h b/include/cutlass/conv/convolution.h new file mode 100644 index 0000000000..c743ea6faa --- /dev/null +++ b/include/cutlass/conv/convolution.h @@ -0,0 +1,118 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief + +This file contains definitions and utility functions for describing convolution problem sizes in terms of +activation (NHWC), filter (KRSC), output (NPQK), pading (pad_h, pad_w), stride (stride_h, stride_w), +dilation (dilation_h, dilation_w). Furthermore, it defines helper functions to map cutlass' implicit gemm +tensor extents, sizes, data types to that of convolutions extents, sizes, and data types. + + * Mapping convolutions to Gemm computation * + +Cutlass employs ImplicitGemm algorithm to implement convolutions. ImplicitGemm algorithm runs gemm operation +on convolution tensors Activation, Filter, and Output . 
The underlying gemm operation follows the standard gemm definition:
+
+    C = A * B + C
+
+    A and B are input matrices
+    C is the source and output matrix
+
+
+For the three convolutional operators (Fprop, Dgrad, Wgrad), ImplicitGemm matrices A, B, and C are mapped
+onto the convolution tensors Activation, Filter, and Output as shown in the table below:
+
+        ___________________________________________________________________________
+         ConvolutionalOperator |      A       |      B       |      C
+        ___________________________________________________________________________
+        |        |             |              |              |
+        | Fprop  |  Activation |    Filter    |    Output    |
+        | Dgrad  |   Output    |    Filter    |  Activation  |
+        | Wgrad  |   Output    |  Activation  |    Filter    |
+        ___________________________________________________________________________
+
+In the convolution codebase, DO NOT mix the (A, B, C) naming with the (Activation, Filter, Output) naming.
+
+For example, a convolution class or function that takes A, B, and Output is confusing and error-prone.
+Instead, use the mapping functions below and consistently use either (A, B, C) or (Activation, Filter, Output).
+
+Map elements' data types (ImplicitGemm -> Conv): GemmToConvElementMap
+Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/tensor_coord.h"
+#include "cutlass/fast_math.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/matrix_coord.h"
+
+namespace cutlass {
+namespace conv {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Convolutional operator
+enum class Operator {
+  kFprop,
+  kDgrad,
+  kWgrad
+};
+
+/// Distinguishes convolution from cross correlation
+enum class Mode {
+  kCrossCorrelation,
+  kConvolution
+};
+
+/// Selects among several implementation variants trading off performance with simplicity
+enum class IteratorAlgorithm {
+  kAnalytic,    ///< functionally correct in all cases but lower performance
+  kOptimized    ///< optimized for R <= 32, S <= 32 and unity-stride dgrad
+};
+
+/// Distinguishes among partial specializations that accelerate certain problems where convolution
+/// stride is unity.
+enum class StrideSupport {
+  kStrided,     ///< arbitrary convolution stride
+  kUnity        ///< unit convolution stride
+};
+
+/// Identifies split-K mode
+enum class SplitKMode {
+  kNone,
+  kSerial,
+  kParallel
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace conv
+} // namespace cutlass
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/conv/device/implicit_gemm_convolution.h b/include/cutlass/conv/device/implicit_gemm_convolution.h
new file mode 100644
index 0000000000..0aa03d1997
--- /dev/null
+++ b/include/cutlass/conv/device/implicit_gemm_convolution.h
@@ -0,0 +1,263 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Template for device-level Implicit GEMM Convolution +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/conv/convolution.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class ImplicitGemmConvolution { +public: + + using ImplicitGemmKernel = ImplicitGemmKernel_; + + using ElementA = typename ImplicitGemmKernel::ElementA; + using LayoutA = typename ImplicitGemmKernel::LayoutA; + using ElementB = typename ImplicitGemmKernel::ElementB; + using LayoutB = typename ImplicitGemmKernel::LayoutB; + using ElementC = typename ImplicitGemmKernel::ElementC; + using LayoutC = typename ImplicitGemmKernel::LayoutC; + using ElementAccumulator = typename ImplicitGemmKernel::ElementAccumulator; + using ElementCompute = typename ImplicitGemmKernel::ElementCompute; + using OperatorClass = typename ImplicitGemmKernel::OperatorClass; + using ArchTag = typename ImplicitGemmKernel::ArchTag; + using ThreadblockShape = typename ImplicitGemmKernel::ThreadblockShape; + using WarpShape = typename ImplicitGemmKernel::WarpShape; + using InstructionShape = typename ImplicitGemmKernel::InstructionShape; + using ThreadblockSwizzle = typename ImplicitGemmKernel::ThreadblockSwizzle; + using EpilogueOutputOp = typename ImplicitGemmKernel::EpilogueOutputOp; + static int const kStages = ImplicitGemmKernel::kStages; + static int const kConvDim = ImplicitGemmKernel::kConvDim; + using WarpMmaOperator = typename ImplicitGemmKernel::WarpMmaOperator; + using ArchMmaOperator = typename ImplicitGemmKernel::ArchMmaOperator; + using MathOperator = typename ImplicitGemmKernel::MathOperator; + + static cutlass::conv::Operator const kConvolutionalOperator = ImplicitGemmKernel::kConvolutionalOperator; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = ImplicitGemmKernel::kIteratorAlgorithm; + + static int const kWarpCount = + (ThreadblockShape::kM / WarpShape::kM) * + (ThreadblockShape::kN / WarpShape::kN); + + /// Argument structure + using Arguments = typename ImplicitGemmKernel::Arguments; + +private: + + /// Kernel parameters object + typename ImplicitGemmKernel::Params params_; + +public: + + /// Constructs Implicit GEMM + ImplicitGemmConvolution() 
{ } + + /// Determines whether the Implicit GEMM can execute the given problem. + static Status can_implement(Arguments const &args) { + + // dispatch to iterators + Status status = ImplicitGemmKernel::Mma::IteratorA::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + status = ImplicitGemmKernel::Mma::IteratorB::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape( + threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size), + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices)); + + if (!(grid.y <= std::numeric_limits::max() && + grid.z <= std::numeric_limits::max())) { + + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t workspace_bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size), + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + if(args.split_k_mode == SplitKMode::kParallel) { + + // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace. + // The user needs to call a reduction operator to optain the final output tensor + workspace_bytes = + sizeof(ElementAccumulator) * + size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size)) * + size_t(grid_tiled_shape.k()); + } + + else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size.split_k_slices > 1) { + + // Split-K serial: The user workspace is used to store semaphore and serialize writing the + // final reduced output to user's output tensor + workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n()); + } + + return workspace_bytes; + } + + /// Initializes GEMM state from arguments. + Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + if (args.problem_size.split_k_slices > 1) { + + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream); + + if (status != cudaSuccess) { + return Status::kErrorInternal; + } + } + + // initialize the params structure from the arguments + params_ = typename ImplicitGemmKernel::Params( + args, + static_cast(workspace) + ); + + int smem_size = int(sizeof(typename ImplicitGemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = cudaFuncSetAttribute( + cutlass::Kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Initializes GEMM state from arguments. 
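+  /// (Lightweight variant of initialize(): refreshes only the tensor pointers, the epilogue
+  /// output op, and the semaphore/workspace pointer; the grid shape and iterator parameters
+  /// computed during initialize() are reused.)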
+ Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.ptr_A = args.ref_A.data(); + params_.ptr_B = args.ref_B.data(); + params_.ptr_C = args.ref_C.data(); + params_.ptr_D = args.ref_D.data(); + params_.output_op = args.output_op; + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(32 * kWarpCount, 1, 1); + + int smem_size = int(sizeof(typename ImplicitGemmKernel::SharedStorage)); + + cutlass::Kernel<<>>(params_); + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} +} +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/kernel/default_conv2d.h b/include/cutlass/conv/kernel/default_conv2d.h new file mode 100644 index 0000000000..57fae79655 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d.h @@ -0,0 +1,104 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief + Default kernel-level implicit GEMM convolution definitions for threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" +#include "cutlass/conv/threadblock/implicit_gemm_pipelined.h" +#include "cutlass/conv/threadblock/implicit_gemm_multistage.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename ArchTag, + typename Shape, + typename WarpMmaTensorOp, + int PartitionsK, + typename OutputOp +> +struct DefaultConvEpilogue { + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + 1, + OutputOp, + OutputOp::kCount + >::Epilogue; +}; + +template < + typename Shape, + typename WarpMmaTensorOp, + int PartitionsK, + typename OutputOp +> +struct DefaultConvEpilogue< + arch::Sm70, + Shape, + WarpMmaTensorOp, + PartitionsK, + OutputOp +> { + + using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOp< + Shape, + WarpMmaTensorOp, + 1, + OutputOp, + OutputOp::kCount + >::Epilogue; +}; + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/include/cutlass/conv/kernel/default_conv2d_dgrad.h new file mode 100644 index 0000000000..c590f57efc --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d_dgrad.h @@ -0,0 +1,1154 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dDgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv2dDgrad; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided and +// multistage pipeline. 
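+//
+// For reference, a hypothetical instantiation that would pick up the specialization below
+// (tile shapes, epilogue functor, and threadblock swizzle are illustrative assumptions, not
+// choices made by this header):
+//
+//   using DgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+//     cutlass::half_t, cutlass::layout::TensorNHWC,   // ElementA, LayoutA (output gradient)
+//     cutlass::half_t, cutlass::layout::TensorNHWC,   // ElementB, LayoutB (filter)
+//     cutlass::half_t, cutlass::layout::TensorNHWC,   // ElementC, LayoutC (activation gradient)
+//     float,                                          // ElementAccumulator
+//     cutlass::arch::OpClassTensorOp,
+//     cutlass::arch::Sm80,
+//     cutlass::gemm::GemmShape<128, 128, 32>,         // ThreadblockShape
+//     cutlass::gemm::GemmShape<64, 64, 32>,           // WarpShape
+//     cutlass::gemm::GemmShape<16, 8, 16>,            // InstructionShape
+//     cutlass::epilogue::thread::LinearCombination<cutlass::half_t, 8, float, float>,
+//     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+//     3,                                              // Stages (values other than 2 select the multistage mainloop)
+//     cutlass::arch::OpMultiplyAdd,
+//     cutlass::conv::IteratorAlgorithm::kAnalytic,
+//     cutlass::conv::StrideSupport::kStrided
+//   >::Kernel;
+//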
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided +// and 2 stage pipeline. 
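+// Note: the two-stage ImplicitGemmPipelined mainloop is the software-pipelined, double-buffered
+// path used on architectures that lack the asynchronous copy (cp.async) instructions exploited
+// by the ImplicitGemmMultistage specialization above.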
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Unity Strided +// and multistage pipeline. 
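+// Note: StrideSupport::kUnity restricts this specialization to unit convolution stride
+// (stride_h == stride_w == 1); dgrad problems with larger strides must use the kStrided
+// variants above.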
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Unity +// 2 stage pipeline. 
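+// Note: in the two-stage path the analytic tile-access iterators are wrapped in
+// conv::threadblock::TileIterator<>, which adapts them to the fragment-loading interface
+// used by ImplicitGemmPipelined; the multistage specializations consume the access
+// iterators directly.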
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for optimized IteratorAlgorithm Dgrad Unity Strided +// and multistage pipeline. 
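+// Note: per the IteratorAlgorithm definitions in convolution.h, kOptimized is intended for
+// filters with R <= 32 and S <= 32 and for unity-stride dgrad, while kAnalytic remains
+// functionally correct in all cases at lower performance.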
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm Dgrad Unity +// 2 stage pipeline +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, 
layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename 
MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + 
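+//
+// A hypothetical end-to-end sketch of running one of the kernels defined above through the
+// device-level wrapper (tensor names, tile parameters, and the Arguments field order shown
+// here are illustrative assumptions, not definitions made by this header):
+//
+//   using DgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< /* ... */ >::Kernel;
+//   using Dgrad       = cutlass::conv::device::ImplicitGemmConvolution<DgradKernel>;
+//
+//   Dgrad dgrad_op;
+//   typename Dgrad::Arguments args(
+//     problem_size,             // cutlass::conv::Conv2dProblemSize
+//     tensor_dy.device_ref(),   // A: output gradient (NHWC)
+//     tensor_w.device_ref(),    // B: filter (KRSC)
+//     tensor_dx.device_ref(),   // C: source activation gradient
+//     tensor_dx.device_ref(),   // D: destination activation gradient
+//     {alpha, beta});           // epilogue parameters
+//
+//   size_t workspace_bytes = Dgrad::get_workspace_size(args);
+//   // ... allocate `workspace_bytes` of device memory at workspace_ptr ...
+//   cutlass::Status status = dgrad_op.initialize(args, workspace_ptr);
+//   if (status == cutlass::Status::kSuccess) {
+//     status = dgrad_op();   // launches the implicit GEMM dgrad kernel
+//   }
+//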
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialzation for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + 
MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>, + ElementA, + ThreadMapA, + StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop.h b/include/cutlass/conv/kernel/default_conv2d_fprop.h new file mode 100644 index 0000000000..c38d5150b1 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d_fprop.h @@ -0,0 +1,1379 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv2dFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage +/// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage +/// pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm +/// and 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and +/// multistage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag + >; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and +// multistage pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true + >; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + layout::TensorNCxHWx, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + layout::TensorCxRSKx, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm +/// and 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + 
IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + 
cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + 
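The SIMT fprop specializations follow the same recipe as the Tensor Core ones, differing mainly in the operator class passed to MmaCore, the warp-level operator (MmaWarpSimt instead of MmaTensorOp), and the SIMT epilogue. A minimal sketch of selecting the two-stage SIMT path just defined is shown below; as before, the concrete element types, tile shapes, epilogue, and swizzle are illustrative assumptions rather than requirements of this header.

```cpp
// Illustrative only: the shapes and functors below are assumptions for the sake of
// the example, not values mandated by default_conv2d_fprop.h.
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"

// Compose the SM50 SIMT (FFMA) fprop kernel with the Analytic iterator algorithm and
// a two-stage pipelined mainloop, matching the specialization defined above.
using Conv2dFpropKernel = cutlass::conv::kernel::DefaultConv2dFprop<
  float, cutlass::layout::TensorNHWC,              // ElementA / LayoutA: activations
  float, cutlass::layout::TensorNHWC,              // ElementB / LayoutB: filters
  float, cutlass::layout::TensorNHWC,              // ElementC / LayoutC: output
  float,                                           // ElementAccumulator
  cutlass::arch::OpClassSimt,                      // FFMA mainloop
  cutlass::arch::Sm50,
  cutlass::gemm::GemmShape<128, 128, 8>,           // ThreadblockShape (illustrative)
  cutlass::gemm::GemmShape<32, 64, 8>,             // WarpShape (illustrative)
  cutlass::gemm::GemmShape<1, 1, 1>,               // InstructionShape for SIMT
  cutlass::epilogue::thread::LinearCombination<float, 1, float, float>,
  cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
  2,                                               // Stages: selects the two-stage pipelined path
  cutlass::arch::OpMultiplyAdd,
  cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
```

Because these templates are partially specialized on the literal stage count 2, requesting two stages routes the composition through ImplicitGemmPipelined, while larger stage counts (with an SM80 architecture tag) select the ImplicitGemmMultistage specializations defined earlier in the file.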
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/include/cutlass/conv/kernel/default_conv2d_wgrad.h new file mode 100644 index 0000000000..c7912203a4 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv2d_wgrad.h @@ -0,0 +1,928 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv2dWgrad; +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for 
Conv2dWgrad specialzation for Analytic IteratorAlgorithm and multistage +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm and two +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm and multistage +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm and two +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename 
cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // 
Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialzation for 
Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/include/cutlass/conv/kernel/default_conv3d_dgrad.h new file mode 100644 index 0000000000..a92b4bfb6a --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv3d_dgrad.h @@ -0,0 +1,184 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dDgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv3dDgrad; + +/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided +// and multistage pipeline. 
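Before the strided, analytic `DefaultConv3dDgrad` specialization below, a brief orientation on how each operator maps convolution extents onto an implicit GEMM. The sketch writes the 2-D mapping out as plain host code; it mirrors what `cutlass::conv::implicit_gemm_problem_size()` computes, the struct and helper names are hypothetical, and for the 3-D kernels in this header the output depth `Z` and filter depth `T` multiply into the corresponding products.

// Hypothetical illustration of the implicit GEMM extents (2-D case, single group).
#include <cstdio>

struct GemmExtent { int m, n, k; };

GemmExtent implicit_gemm_extent_2d(
    char op,                       // 'f' = fprop, 'd' = dgrad, 'w' = wgrad
    int N, int H, int W, int C,    // activation extents (NHWC)
    int K, int R, int S,           // filter extents (KRSC)
    int P, int Q) {                // output extents (NPQK)
  switch (op) {
    case 'f': return { N * P * Q, K, R * S * C };   // Fprop: output pixels x filters
    case 'd': return { N * H * W, C, K * R * S };   // Dgrad: activation pixels x channels
    default : return { K, R * S * C, N * P * Q };   // Wgrad: reduces over the output volume
  }
}

int main() {
  GemmExtent e = implicit_gemm_extent_2d('d', 32, 56, 56, 64, 128, 3, 3, 56, 56);
  std::printf("dgrad implicit GEMM: M=%d N=%d K=%d\n", e.m, e.n, e.k);
  return 0;
}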
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop.h b/include/cutlass/conv/kernel/default_conv3d_fprop.h new file mode 100644 index 0000000000..7694c8b9e8 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv3d_fprop.h @@ -0,0 +1,181 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv3dFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage +// pipeline. 
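One more piece of orientation before the `DefaultConv3dFprop` specialization below: the mainloop trip count later stored as `gemm_k_iterations` is not simply `GEMM_K / ThreadblockShape::kK`; for fprop and dgrad the tile iterators visit every filter position and tile the channel (or filter-count) dimension within each position. The helper below is a hedged sketch of that trip count for the 2-D case with `split_k_slices == 1`; it mirrors the role of `cutlass::conv::implicit_gemm_k_iterations()` rather than reproducing it, and the 3-D variants multiply by the filter depth `T` as well.

// Hypothetical sketch of the threadblock mainloop trip count (2-D, split_k_slices == 1).
inline int ceil_div(int a, int b) { return (a + b - 1) / b; }

int implicit_gemm_k_iterations_2d(
    char op,            // 'f' = fprop, 'd' = dgrad, 'w' = wgrad
    int threadblock_k,  // ThreadblockShape::kK
    int N, int C, int K, int R, int S, int P, int Q) {
  switch (op) {
    case 'f': return R * S * ceil_div(C, threadblock_k);  // filter positions x C tiles
    case 'd': return R * S * ceil_div(K, threadblock_k);  // filter positions x K tiles
    default : return ceil_div(N * P * Q, threadblock_k);  // wgrad: tiles of the output volume
  }
}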
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/include/cutlass/conv/kernel/default_conv3d_wgrad.h new file mode 100644 index 0000000000..b0f5b91558 --- /dev/null +++ b/include/cutlass/conv/kernel/default_conv3d_wgrad.h @@ -0,0 +1,504 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv3dWgrad; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialzation for Analytic IteratorAlgorithm and multistage +// pipeline. 
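Before the analytic, multi-stage `DefaultConv3dWgrad` specialization below, note that wgrad reduces over the entire output volume: GEMM_K equals `N * Z * P * Q`, usually far larger than GEMM_M (`K`) and GEMM_N (`T * R * S * C`), which is why serial or parallel split-K reduction is the usual way to expose enough parallelism. The helper below is a hypothetical sizing heuristic for `split_k_slices`, included only to make that trade-off concrete; it is not a CUTLASS routine.

// Hypothetical heuristic: split the wgrad reduction so that each slice runs roughly
// `target_iterations_per_slice` mainloop iterations.
inline int ceil_div(int a, int b) { return (a + b - 1) / b; }

int suggest_split_k_slices(int gemm_k, int threadblock_k, int target_iterations_per_slice) {
  int total_iterations = ceil_div(gemm_k, threadblock_k);
  int slices = ceil_div(total_iterations, target_iterations_per_slice);
  return slices < 1 ? 1 : slices;
}

// Example: N = 32, Z = P = Q = 28 gives gemm_k = 32 * 28 * 28 * 28 = 702464; with
// ThreadblockShape::kK = 32 and a target of 256 iterations per slice this suggests 86 slices.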
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dWgrad specialzation for Analytic IteratorAlgorithm and two +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialzation for Optimized IteratorAlgorithm and multistage +// pipeline. 
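Before the optimized, multi-stage `DefaultConv3dWgrad` specialization below: relative to `kAnalytic`, the `kOptimized` iterator algorithm moves work to the host-side `Params` (precomputed pointer increments) and packs one guard bit per access into a small integer mask instead of re-evaluating tensor bounds on every load; `Conv2dDgradFilterTileAccessIteratorOptimized` further down in this diff shows the pattern. The standalone sketch below illustrates that predicate-packing idiom with hypothetical names; it is not the library's iterator.

// Hypothetical illustration of packing per-access guard bits into a 32-bit mask.
#include <cstdint>

struct PackedPredicates {
  uint32_t bits = 0;

  // Record whether access (contiguous index c, strided index s) is in bounds.
  void set(int c, int s, int accesses_per_row, bool valid) {
    bits |= (uint32_t(valid ? 1u : 0u) << (c + s * accesses_per_row));
  }
  bool test(int c, int s, int accesses_per_row) const {
    return (bits >> (c + s * accesses_per_row)) & 1u;
  }
  // Invalidate every access in one strided row, e.g. once the filter K index runs past K.
  void clear_row(int s, int accesses_per_row) {
    uint32_t mask = ((1u << accesses_per_row) - 1) << (s * accesses_per_row);
    bits &= ~mask;
  }
};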
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dWgrad specialzation for Optimized IteratorAlgorithm and two +// pipeline. 
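Finally, the two-stage `DefaultConv3dWgrad` specialization that follows pairs `ImplicitGemmPipelined` with the `ArchTag`-dispatched `detail::DefaultConvEpilogue`, so it also serves pre-Ampere targets, whereas the multi-stage specializations above assume the SM80 `cp.async` mainloop. CUTLASS takes the stage count as an explicit template parameter; the trait below is only a hypothetical sketch of how a wrapper might choose a default per architecture.

// Hypothetical default-stage-count trait; not part of CUTLASS.
#include "cutlass/arch/arch.h"

template <typename ArchTag>
struct DefaultStages {
  static int const kStages = 2;   // pre-SM80: double-buffered ImplicitGemmPipelined
};

template <>
struct DefaultStages<cutlass::arch::Sm80> {
  static int const kStages = 3;   // SM80: cp.async-based ImplicitGemmMultistage
};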
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/include/cutlass/conv/kernel/implicit_gemm_convolution.h new file mode 100644 index 0000000000..2ec1566889 --- /dev/null +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution.h @@ -0,0 +1,424 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined Implicit GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ = Conv2dProblemSize ///! 
Convolutional operator on 2D or 3D problem +> +struct ImplicitGemmConvolution { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename EpilogueOutputOp::ElementOutput; + + /// Set output tensor C layout + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using WarpMmaOperator = typename Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename WarpMmaOperator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + static int const kStages = Mma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using TensorRefA = typename Mma::IteratorA::TensorRef; + using TensorRefB = typename Mma::IteratorB::TensorRef; + using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are the same and + // set device::ImplicitGemmConvolution::kConvDim + static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, + "Convolution on different different dimensions is not supported"); + static int const kConvDim = Mma::IteratorA::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + /// Wgrad C stride idx for implicit gemm algorithm + // Conv2d row-major matrix C (KxRSC) + // Conv3d row-major matrix C (KxTRSC) + static int const kWgradCStrideIdx = + cutlass::platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorCStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? 
kWgradCStrideIdx : 0); + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size; + TensorRefA ref_A; + TensorRefB ref_B; + TensorRefC ref_C; + TensorRefC ref_D; + typename EpilogueOutputOp::Params output_op; + SplitKMode split_k_mode; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size, + TensorRefA const & ref_A, + TensorRefB const & ref_B, + TensorRefC const & ref_C, + TensorRefC const & ref_D, + typename EpilogueOutputOp::Params const & output_op, + SplitKMode const & split_k_mode = SplitKMode::kSerial + ): + problem_size(problem_size), + ref_A(ref_A), + ref_B(ref_B), + ref_C(ref_C), + ref_D(ref_D), + output_op(output_op), + split_k_mode(split_k_mode) + { + + } + + }; + + /// Parameters structure + struct Params { + ConvProblemSize problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + gemm::GemmCoord implicit_gemm_problem_size; + int gemm_k_iterations; + typename Mma::IteratorA::Params iterator_A; + typename Mma::IteratorA::Element const *ptr_A; + typename Mma::IteratorB::Params iterator_B; + typename Mma::IteratorB::Element const *ptr_B; + typename Epilogue::OutputTileIterator::Params iterator_C; + typename Epilogue::OutputTileIterator::Element *ptr_C; + typename Epilogue::OutputTileIterator::Params iterator_D; + typename Epilogue::OutputTileIterator::Element *ptr_D; + typename EpilogueOutputOp::Params output_op; + int *semaphore; + SplitKMode split_k_mode; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): gemm_k_iterations(0) { } + + /// + CUTLASS_HOST_DEVICE + Params( + Arguments const &args, + int *semaphore = nullptr + ): + problem_size(args.problem_size), + implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), + grid_tiled_shape(grid_tiled_shape), + iterator_A(args.problem_size, args.ref_A.layout()), + ptr_A(args.ref_A.data()), + iterator_B(args.problem_size, args.ref_B.layout()), + ptr_B(args.ref_B.data()), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)), + ptr_C(args.ref_C.data()), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)), + ptr_D(args.ref_D.data()), + output_op(args.output_op), + semaphore(semaphore), + split_k_mode(args.split_k_mode) + { + gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + implicit_gemm_problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + ImplicitGemmConvolution() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + 
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.iterator_A, + params.problem_size, + params.ptr_A, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.k() * Mma::Shape::kK + ) + ); + + typename Mma::IteratorB iterator_B( + params.iterator_B, + params.problem_size, + params.ptr_B, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * Mma::Shape::kK, + threadblock_tile_idx.n() * Mma::Shape::kN + ) + ); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // Construct the semaphore. + int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); + + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + + // If performing a reduction via split-K, fetch the initial synchronization + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k()); + } + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.n() * Mma::Shape::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D( + params.iterator_D, + params.ptr_D, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. 
+ if (threadblock_tile_idx.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_idx.k()); + + __threadfence(); + } + // Each split-k-slice writes to a unique tensor location + else if (params.split_k_mode == SplitKMode::kParallel) { + iterator_D.add_pointer_offset(threadblock_tile_idx.k() * + cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size)); + } + + // Run efficient epilogue + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // + // Release the semaphore + // + + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_idx.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..14c8a4e829 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h @@ -0,0 +1,240 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dDgradFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or larger."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // For a fixed filter position (r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension + int filter_r_; + int filter_s_; + int offset_k_[ThreadMap::Iterations::kStrided]; + int offset_c_[ThreadMap::Iterations::kContiguous]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = + threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } 
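+
+  // Traversal order: advance() visits filter positions with `s` varying fastest, then `r`;
+  // once both wrap, it moves every strided offset forward by Shape::kRow * split_k_slices,
+  // so each split-K slice covers a disjoint set of K tiles. The offset_k_ / offset_c_ arrays
+  // hold this thread's strided (K) and contiguous (C) coordinates set in the constructor.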
+ + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the filter tensor w that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int c = offset_c_[iteration_contiguous_]; + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_r_, filter_s_, c); + } + + /// Returns true if the current coordinate is within the filter tensor w + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..f76dcde931 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h @@ -0,0 +1,283 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv2dDgradFilterTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = StrideSupport_; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // + // Parameters structure + // + + struct Params : Conv2dDgradFilterIteratorOptimizedParams { + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dDgradFilterIteratorOptimizedParams const &base): + Conv2dDgradFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dDgradFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { } + + }; + 
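// Unlike the analytic iterator, which recomputes bounds from coordinates on
// every access, this class packs one validity bit per (contiguous, strided)
// access into the 32-bit predicates_ word declared below; the bits are built
// once in the constructor, and advance() masks off a whole strided slice when
// filter_k_ runs past K, which is why the access count must fit in 32 bits.
// The standalone sketch below mirrors that packing with toy extents and
// made-up per-thread offsets.

#include <cstdint>
#include <cstdio>

int main() {
  // Toy stand-ins for ThreadMap::Iterations and ThreadMap::Delta (assumed values).
  int const kContiguous = 2, kStrided = 4;
  int const delta_k = 2, delta_c = 4;
  int const K = 6, C = 7;                    // toy problem extents
  int const k0 = 1, c0 = 3;                  // assumed per-thread starting offsets

  // Pack one validity bit per access, exactly once, as the constructor does.
  uint32_t predicates = 0;
  for (int s = 0; s < kStrided; ++s) {
    for (int c = 0; c < kContiguous; ++c) {
      bool in_bounds = (k0 + s * delta_k) < K && (c0 + c * delta_c) < C;
      predicates |= uint32_t(in_bounds) << (c + s * kContiguous);
    }
  }

  // When a strided slice falls past K, all of its bits are cleared with one
  // mask instead of re-testing every access, as advance() does.
  int const s_out = 2;                       // hypothetical out-of-range slice
  uint32_t const clear_mask = ((1u << kContiguous) - 1) << (s_out * kContiguous);
  predicates &= ~clear_mask;

  std::printf("packed predicates = 0x%02x\n", predicates);
  return 0;
}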
+private: + + Conv2dDgradFilterIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_rs_; + int filter_k_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided * + ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorOptimized( + Conv2dDgradFilterIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_rs_(0), + filter_k_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.strided(); + Index column = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided; + int filter_c = column + c * ThreadMap::Delta::kContiguous; + + uint32_t pred = ((filter_k < problem_size_.K && filter_c < problem_size_.C) ? 1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + pointer_ += ( + filter_k_ * params.layout.stride()[2] + column + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_rs; + + // moves to the next tile + ++filter_rs_; + if (filter_rs_ == params_.RS) { + + filter_rs_ = 0; + next = params_.inc_next_k; + filter_k_ += params_.filter_k_delta; + } + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + predicates_ = (predicates_ & (~kClearMask)); + } + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + 
Conv2dDgradFilterTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_strided; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..d32da7c3bf --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 +1,525 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided +> +class Conv2dDgradOutputGradientTileAccessIteratorAnalytic; +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradOutputGradientTileAccessIteratorAnalytic strided dgrad needs special handling using +// unscaled coordinations +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kStrided +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_w_[ThreadMap::Iterations::kStrided]; + int offset_h_[ThreadMap::Iterations::kStrided]; + +private: + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator but DOES NOT scale by the convolution stride. This is needed + /// to compute predicates in the valid() method. The return value of the public at() + /// method is correctly scaled. 
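// Concretely: the unscaled value h + pad_h - r * dilation_h names a real row
// of Dy only when it is divisible by stride_h, so valid() tests divisibility
// on the unscaled coordinate while at() performs the division. The standalone
// sketch below walks one input row of a made-up strided problem and shows
// which filter rows actually contribute; all sizes are illustrative only.

#include <cstdio>

int main() {
  // Hypothetical strided-dgrad problem: stride 2, pad 1, dilation 1, 3x3 filter.
  int const stride_h = 2, pad_h = 1, dilation_h = 1, R = 3, P = 4;

  int const h = 3;                                   // one input row of Dx
  for (int r = 0; r < R; ++r) {
    int unscaled = h + pad_h - r * dilation_h;       // what unscaled_at_() yields
    bool divisible = (unscaled % stride_h) == 0;     // the divisibility test in valid()
    int p = unscaled / stride_h;                     // the scaled row that at() yields
    bool contributes = divisible && p >= 0 && p < P;
    std::printf("r=%d  unscaled=%d  p=%d  contributes=%d\n",
                r, unscaled, p, int(contributes));
  }
  return 0;
}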
+ CUTLASS_HOST_DEVICE + TensorCoord unscaled_at_() const { + int n = offset_n_[iteration_strided_]; + int h = offset_h_[iteration_strided_]; + int w = offset_w_[iteration_strided_]; + + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h); + int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w); + + return TensorCoord(n, p, q, filter_k_); + } + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_nhw / (problem_size_.H * problem_size_.W); + int residual = offset_nhw % (problem_size_.H * problem_size_.W); + + offset_h_[s] = residual / problem_size_.W; + offset_w_[s] = residual % problem_size_.W; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // move to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + filter_k_ += Shape_::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + TensorCoord coord = unscaled_at_(); + + return TensorCoord( + coord.n(), + coord.h() / problem_size_.stride_h, + coord.w() / problem_size_.stride_w, + coord.c()); + } + + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord unscaled_coord = unscaled_at_(); + TensorCoord coord = at(); + + return + !(unscaled_coord.h() % problem_size_.stride_h) && !(unscaled_coord.w() % problem_size_.stride_w) && + coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.P && + coord.w() >= 0 && coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradOutputGradientTileAccessIteratorAnalytic for unity strides can be optimized by +// eliminating modulo arithmetic to compute unscaled coordinates +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kUnity +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const 
*pointer_; + + int filter_k_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_w_[ThreadMap::Iterations::kStrided]; + int offset_h_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_nhw / (problem_size_.H * problem_size_.W); + int residual = offset_nhw % (problem_size_.H * problem_size_.W); + + offset_h_[s] = residual / problem_size_.W; + offset_w_[s] = residual % problem_size_.W; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // move to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + filter_k_ += Shape_::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int n = offset_n_[iteration_strided_]; + int h = offset_h_[iteration_strided_]; + int w = offset_w_[iteration_strided_]; + + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h) / problem_size_.stride_h; + int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w) / problem_size_.stride_w; + + return TensorCoord(n, p, q, filter_k_); + + } + + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.P && + coord.w() >= 0 && coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // Conv2dDgradFilterTileAccessIteratorAnalytic unity stride specialization + // only supports (stride_h, stride_w) = (1, 1) + if (problem_size.stride() != MatrixCoord({1, 1})) { + return Status::kErrorNotSupported; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..71299cf578 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,437 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv2dDgradOutputGradientTileAccessIteratorOptimized { +public: + + static_assert(StrideSupport_ == conv::StrideSupport::kUnity, + "Only unit-stride dgrad is supported at this time."); + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + using Mask = uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require 
Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv2dDgradOutputGradientIteratorOptimizedParams { + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dDgradOutputGradientIteratorOptimizedParams const &base): + Conv2dDgradOutputGradientIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dDgradOutputGradientIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { } + }; + +private: + + Conv2dDgradOutputGradientIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (r, s) + int filter_r_; + int filter_s_; + int filter_k_; + + Index masks_[ThreadMap::Iterations::kStrided][2]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorOptimized( + Conv2dDgradOutputGradientIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + filter_k_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_h[ThreadMap::Iterations::kStrided]; + int offset_w[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // offset_n[s] = offset_nhw / (problem_size_.H * problem_size_.W); + // int residual = offset_nhw % (problem_size_.H * problem_size_.W); + // + // offset_h[s] = residual / problem_size_.W; + // offset_w[s] = residual % problem_size_.W; + // + + int residual; + + params_.hw_divmod(offset_n[s], residual, offset_nhw); + params_.w_divmod(offset_h[s], offset_w[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_h[s], offset_w[s], 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h; + + bool pred = (offset_n[s_idx] < problem_size_.N && p >= 0 && p < problem_size_.P); + masks_[s_idx][0] |= (pred << r); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == 
Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int q = offset_w[s_idx] + problem_size_.pad_w - s_ * problem_size_.dilation_w; + + bool pred = (q >= 0 && q < problem_size_.Q); + masks_[s_idx][1] |= (pred << s); + } + } + + if (filter_k_ >= problem_size.K) { + clear_mask(); + } + + set_iteration_index(0); + } + +private: + + /// Returns the coordinate in the output gradient tensor dy that is correspoinding to + // output nhw and filter position k, r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int h, int w, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int p = h + problem_size_.pad_h - r * problem_size_.dilation_h; + int q = w + problem_size_.pad_w - s * problem_size_.dilation_w; + + return TensorCoord(n, p, q, filter_k_); + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are generated. Instead, our + // intent is to predicate the mov instructions. + #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + filter_s_ = 0; + ++filter_r_; + + if (filter_r_ < problem_size_.R) { + next_idx = 1; + } + else { + filter_r_ = 0; + next_idx = 2; + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 2) { + filter_k_ += params_.filter_k_delta; + } + + clear_mask_(filter_k_ >= problem_size_.K); + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask() { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return 
reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // This is specialized for unit stride + if (problem_size.stride() != MatrixCoord({1, 1})) { + return Status::kErrorNotSupported; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorNotSupported; + } + + // Limit on filter size + if (problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..92dd705d6b --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h @@ -0,0 +1,274 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorNCxHWx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv2dFpropActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_c_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_p_[ThreadMap::Iterations::kStrided]; + int offset_q_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_c_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q); + int residual = offset_npq % (problem_size_.P * problem_size_.Q); + + offset_p_[s] = residual / problem_size_.Q; + offset_q_[s] = residual % problem_size_.Q; + } + + 
set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + filter_c_ += Shape::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + int n = offset_n_[iteration_strided_]; + int p = offset_p_[iteration_strided_]; + int q = offset_q_[iteration_strided_]; + + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - filter_r_); + s = (problem_size_.S - 1 - filter_s_); + } + + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, h, w, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + AccessType const *ptr = reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
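// Before can_implement() is defined, note what at() and valid() above compute:
// an output position (n, p, q) combined with a filter offset (r, s) reads the
// activation element at h = p * stride_h - pad_h + r * dilation_h (and the
// analogous w), and any coordinate that lands outside the tensor is simply
// predicated off, which is how zero padding is realized without materializing
// an im2col buffer. The standalone sketch below prints that mapping for one
// output position of a made-up problem; the sizes are illustrative only.

#include <cstdio>

int main() {
  // Hypothetical fprop problem: 5x5 input, 3x3 filter, stride 1, pad 1, dilation 1.
  int const H = 5, W = 5, R = 3, S = 3;
  int const stride_h = 1, stride_w = 1, pad_h = 1, pad_w = 1, dil_h = 1, dil_w = 1;

  int const p = 0, q = 0;                                   // one output position
  for (int r = 0; r < R; ++r) {
    for (int s = 0; s < S; ++s) {
      int h = p * stride_h - pad_h + r * dil_h;             // same mapping as at()
      int w = q * stride_w - pad_w + s * dil_w;
      bool in_bounds = h >= 0 && h < H && w >= 0 && w < W;  // same test as valid()
      std::printf("(r=%d,s=%d) -> (h=%d,w=%d) %s\n",
                  r, s, h, w, in_bounds ? "load" : "zero (padding)");
    }
  }
  return 0;
}

// can_implement(), defined next, additionally requires the contiguous C
// dimension to fill whole 128-bit accesses, for example a multiple of 8
// half-precision or 4 single-precision elements, with stricter multiples for
// the interleaved NCxHWx layouts.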
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.C % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.C % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..afb015d352 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h @@ -0,0 +1,438 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorNCxHWx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv2dFpropActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + using Mask = uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv2dFpropActivationIteratorOptimizedParams { + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dFpropActivationIteratorOptimizedParams const &base): + Conv2dFpropActivationIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dFpropActivationIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { + + } + }; + +private: + + Conv2dFpropActivationIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (r, s) + int filter_r_; + int filter_s_; + int filter_c_; + + Index masks_[ThreadMap::Iterations::kStrided][2]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorOptimized( + Conv2dFpropActivationIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + filter_c_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_p[ThreadMap::Iterations::kStrided]; + 
int offset_q[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // offset_n[s] = offset_npq / (problem_size_.P * problem_size_.Q); + // int residual = offset_npq % (problem_size_.P * problem_size_.Q); + // + // offset_p[s] = residual / problem_size_.Q; + // offset_q[s] = residual % problem_size_.Q; + // + + int residual; + + params.pq_divmod(offset_n[s], residual, offset_npq); + params.q_divmod(offset_p[s], offset_q[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_p[s], offset_q[s], 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h; + + bool pred = (offset_n[s_idx] < problem_size_.N && h >= 0 && h < problem_size_.H); + masks_[s_idx][0] |= (pred << r); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w; + + bool pred = (w >= 0 && w < problem_size_.W); + masks_[s_idx][1] |= (pred << s); + } + } + + if (filter_c_ >= problem_size.C) { + clear_mask(); + } + + set_iteration_index(0); + } + +private: + + /// Returns the coordinate in the activations tensor X that is correspoinding to + // output npq and filter position r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int p, int q, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, h, w, filter_c_); + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are generated. Instead, our + // intent is to predicate the mov instructions. 
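+      // Both paths implement the same predicated assignment,
+      //   masks_[s][i] = clear ? 0 : masks_[s][i];
+      // the PTX form forces a predicated mov so no branch is emitted, while
+      // the C++ fallback below leaves that decision to the compiler.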
+ #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + filter_s_ = 0; + ++filter_r_; + + if (filter_r_ < problem_size_.R) { + next_idx = 1; + } + else { + filter_r_ = 0; + next_idx = 2; + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 2) { + filter_c_ += params_.filter_c_delta; + } + + clear_mask_(filter_c_ >= problem_size_.C); + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask() { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.C % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.C % 64) { + return Status::kErrorInvalidProblem; + } + } + + // Conv2dFpropActivationTileAccessIteratorOptimized has constraint on filter positions + // due to the number of mask bits. 
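+    // Each masks_ word stores one predicate bit per filter row r (masks_[s][0])
+    // or per filter column s (masks_[s][1]), so the 32-bit mask words cap R and S at 32.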
+ if (problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..6547e9c5ba --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h @@ -0,0 +1,252 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv2dFpropFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_r_; + int filter_s_; + int filter_c_; + + int offset_k_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + filter_c_ += Shape::kRow * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the filter tensor W 
that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_r_, filter_s_, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.K % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.K % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..bf0d1d3124 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv2dFpropFilterTileAccessIteratorOptimized{ +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv2dFpropFilterIteratorOptimizedParams { + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dFpropFilterIteratorOptimizedParams const &base): + Conv2dFpropFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dFpropFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { + + } + }; + +private: + + Conv2dFpropFilterIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex 
iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_rs_; + int filter_c_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorOptimized( + Conv2dFpropFilterIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_rs_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + Index column = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < problem_size_.K) ? 1u : 0); + predicates_ |= (pred << s); + } + + if (filter_c_ >= problem_size.C) { + predicates_ = 0u; + } + + pointer_ += ( + params_.layout({filter_c_, column}) + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_rs; + + // moves to the next tile + ++filter_rs_; + if (filter_rs_ == params_.RS) { + + filter_rs_ = 0; + next = params_.inc_next_c; + filter_c_ += params_.filter_c_delta; + } + + if (filter_c_ >= problem_size_.C) { + predicates_ = 0; + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + return (predicates_ & (1u << iteration_strided_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_k; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
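The `can_implement()` check that follows enforces 128-bit aligned vector accesses along the iterator's contiguous dimension, plus extra divisibility (by 32 or 64) for the interleaved `TensorCxRSKx` layouts. A small sketch of the arithmetic behind the first check (the element widths below are illustrative):

```cpp
#include <cstdio>

int main() {
  struct Elem { char const *name; int bits; };
  Elem const elems[] = {
    {"float / tf32", 32}, {"half_t / bfloat16_t", 16}, {"int8_t", 8}, {"int4b_t", 4},
  };

  for (Elem const &e : elems) {
    int vector_elems = 128 / e.bits;   // elements per 128-bit global access
    printf("%-20s -> contiguous dimension must be a multiple of %d\n",
           e.name, vector_elems);
  }
  return 0;
}
```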
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.K % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.K % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv2d_params.h b/include/cutlass/conv/threadblock/conv2d_params.h new file mode 100644 index 0000000000..ac6b2e3095 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_params.h @@ -0,0 +1,609 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Extracts the host-params objects into non-template code. 
+*/ + +#pragma once + +#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0 + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Params structure used for all Conv2d analytic tile iterators +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dAnalyticParams { + + using Layout = Layout_; + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dAnalyticParams() { } + + CUTLASS_HOST_DEVICE + Conv2dAnalyticParams( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED + +CUTLASS_HOST_DEVICE +void TraceIteratorParams( + char const *conv_operator, + char const *operand, + int element_size_bits, + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta +) { + +#if !defined(__CUDA_ARCH__) + + char const *fname = "conv_iterator_params.csv"; + + std::ifstream test(fname); + bool file_exists = test.is_open(); + + if (file_exists) { + test.close(); + } + + std::ofstream trace("conv_iterator_params.csv", std::ofstream::app); + + if (!file_exists) { + trace + << "Operator,Operand,ElementSize,CtaRows,CtaColumns,ThreadCount,AccessSize," + << "IterationsContiguous,IterationsStrided,DeltaContiguous,DeltaStrided\n"; + } + + trace << conv_operator << "," << operand << "," << element_size_bits << "," + << threadblock_shape.row() << "," << threadblock_shape.column() + << "," << thread_count << "," << access_size + << "," << threadmap_iterations.contiguous() << "," << threadmap_iterations.strided() + << "," << threadmap_delta.contiguous() << "," << threadmap_delta.strided() << "\n"; +#endif +} + +#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) \ + TraceIteratorParams(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta); + +#else + +#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) {} + +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dFpropActivationIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized +template<> +struct Conv2dFpropActivationIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next C} + int filter_c_delta; // number of logical elements to add to filter_c_ + int PQ; // product of P*Q + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + // + 
// Methods + // + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), PQ(problem_size.P * problem_size.Q), pq_divmod(PQ), q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1); + + // next S + inc_next[0] = conv_sign * (int64_t(layout.stride()[0]) * problem_size.dilation_w) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next C + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; + +/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized +template +struct Conv2dFpropActivationIteratorOptimizedParams> { + static int const kInterleaved = Interleaved_; + + using Layout = layout::TensorNCxHWx; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next C} + int filter_c_delta; // number of logical elements to add to filter_c_ + int PQ; // product of P*Q + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), PQ(problem_size.P * problem_size.Q), pq_divmod(PQ), q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 
-1 : 1); + + // next S + inc_next[0] = conv_sign * (kInterleaved * problem_size.dilation_w) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_h + - (problem_size.S - 1) * kInterleaved * problem_size.dilation_w + ) * element_size_bits / 8; + + // next C + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[1]) + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[0] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * kInterleaved * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dFpropFilterIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Conv2dFpropFilterIteratorOptimizedParams +{ + + using Layout = layout::TensorNHWC; + + Layout layout; + int RS; + int filter_c_delta; + + int64_t inc_next_k; // offset in units of bytes to next K position + int64_t inc_next_rs; // offset in units of bytes to next RS position + int64_t inc_next_c; // offset in units of bytes to next C position + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + RS = problem_size.R * problem_size.S; + + inc_next_k = (int64_t(layout.stride()[2]) * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_rs = + ( int64_t(layout.stride()[0]) + - int64_t(layout.stride()[2]) * (threadmap_iterations.strided() - 1) * threadmap_delta.strided() + ) * element_size_bits / 8; + + inc_next_c = + ( + threadblock_shape.row() * problem_size.split_k_slices + - int64_t(RS - 1) * layout.stride()[0] + - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +template +struct Conv2dFpropFilterIteratorOptimizedParams> +{ + static int const kInterleaved = Interleaved_; + using Layout = layout::TensorCxRSKx; + + Layout layout; + int RS; + int filter_c_delta; + + int64_t inc_next_k; // offset in units of bytes to next K position + int64_t inc_next_rs; // offset in units of bytes to next RS position + int64_t inc_next_c; // offset in units of bytes to next C position + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int 
access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + RS = problem_size.R * problem_size.S; + + inc_next_k = (kInterleaved * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_rs = + ( int64_t(layout.stride()[0]) + - kInterleaved * (threadmap_iterations.strided() - 1) * threadmap_delta.strided() + ) * element_size_bits / 8; + + inc_next_c = + ( + threadblock_shape.row() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[2]) + - int64_t(RS - 1) * layout.stride()[0] + - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * kInterleaved + ) * element_size_bits / 8; + + filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +/// Parameters object for Conv2d DGRAD OutputGradient (dy) iterator +struct Conv2dDgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next K} + + int filter_k_delta; // number of logical elements to add to filter_k_ + + int HW; // product of H*W + + FastDivmod hw_divmod; + FastDivmod w_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), HW(problem_size.H *problem_size.W), hw_divmod(HW), w_divmod(problem_size.W) { + + TRACE_CONV_INITIALIZERS("conv2d_dgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 
1 : -1); + + // next S + inc_next[0] = conv_sign * (layout.stride()[0] * problem_size.dilation_w) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next K + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; + +/// Parameters object for Conv2d DGRAD Filter (w) iterator +struct Conv2dDgradFilterIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + int RS; + int filter_k_delta; + + int64_t inc_next_strided; // offset in units of bytes to next K coordinate within tile + int64_t inc_next_rs; // offset in units of bytes to next RS position + int64_t inc_next_k; // offset in units of bytes to next K position in subsequent tile + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dDgradFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), RS(problem_size.R * problem_size.S) { + + TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + inc_next_strided = (layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_rs = + ( layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + inc_next_k = + ( + threadblock_shape.row() * problem_size.split_k_slices * layout.stride()[2] + - (problem_size.R * problem_size.S - 1) * layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters object for Conv2d WGRAD Output Gradient (dy) iterator +struct Conv2dWgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int NPQ; // precomputd product of N*P*Q for clearing predicates + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + int64_t offset_next_strided; // offset in units of bytes to next npq coordinate within tile + int64_t offset_next_contiguous; // offset in units of bytes to next k coordinate within tile + int64_t inc_next_npq; // offset in units of bytes to next npq position in subsequent tile + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + 
int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + NPQ(problem_size.N * problem_size.P * problem_size.Q), + pq_divmod(problem_size.P * problem_size.Q), + q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv2d_wgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + // Incremental offsets in unites of bytes (number of elements) * sizeof_bits::value / 8 + offset_next_strided = (threadmap_delta.strided() * layout.stride()[0]) + * element_size_bits / 8; + + offset_next_contiguous = (threadmap_delta.contiguous()) + * element_size_bits / 8; + + inc_next_npq = (threadblock_shape.column() * problem_size.split_k_slices * layout.stride()[0]) + * element_size_bits / 8; + } +}; + +struct Conv2dWgradActivationIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + FastDivmod sc_divmod; + FastDivmod pq_divmod; + FastDivmod q_divmod; + FastDivmod c_divmod; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dWgradActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + layout(layout), + sc_divmod(problem_size.S * problem_size.C), + pq_divmod(problem_size.P * problem_size.Q), + q_divmod(problem_size.Q), + c_divmod(problem_size.C) { + + } + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + Conv2dWgradActivationIteratorOptimizedParams( + problem_size, + layout + ) { + + TRACE_CONV_INITIALIZERS("conv2d_wgrad", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h new file mode 100644 index 0000000000..ce52017e37 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h @@ -0,0 +1,170 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template wraps the tile access iterator concept to load whole tiles from tensors in + memory used for implicit GEMM convolution. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class TileIterator { +public: + using TileAccessIterator = TileAccessIterator_; + + using Shape = typename TileAccessIterator::Shape; + using Element = typename TileAccessIterator::Element; + using Layout = typename TileAccessIterator::Layout; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = typename TileAccessIterator::ThreadMap; + using AccessType = typename TileAccessIterator::AccessType; + using TensorRef = typename TileAccessIterator::TensorRef; + using Index = typename TileAccessIterator::Index; + using LongIndex = typename TileAccessIterator::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = TileAccessIterator::kIteratorAlgorithm; + static StrideSupport const kStrideSupport = TileAccessIterator::kStrideSupport; + using Params = typename TileAccessIterator::Params; + static int const kConvDim = TileAccessIterator::kConvDim; + using ConvProblemSize = typename TileAccessIterator::ConvProblemSize; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + +private: + + /// Internal state + TileAccessIterator tile_access_iterator_; + +public: + + /// Constructor + CUTLASS_HOST_DEVICE + TileIterator( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + tile_access_iterator_(params, problem_size, ptr, thread_idx, threadblock_offset) { } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + tile_access_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. 
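In the load path further below, the wrapper walks the access iterator once per (contiguous, strided) iteration and drops each vector into the flat `Fragment`, with the contiguous index advancing fastest. A tiny sketch of that ordering, assuming a hypothetical 2x4 thread map:

```cpp
#include <cstdio>

int main() {
  // Hypothetical ThreadMap::Iterations: 2 contiguous x 4 strided accesses per thread.
  int const kContiguous = 2, kStrided = 4;

  for (int s = 0; s < kStrided; ++s) {
    for (int c = 0; c < kContiguous; ++c) {
      // Mirrors frag_ptr[c + s * ThreadMap::Iterations::kContiguous] in load_with_pointer_offset().
      printf("access (c=%d, s=%d) -> fragment slot %d\n", c, s, c + s * kContiguous);
    }
  }
  return 0;
}
```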
+ CUTLASS_HOST_DEVICE + TileIterator &operator++() { + tile_access_iterator_.advance(); + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + TileIterator operator++(int) { + TileIterator self(*this); + operator++(); + return self; + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + frag.clear(); + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + frag_ptr[c + s * ThreadMap::Iterations::kContiguous], + tile_access_iterator_.get() + pointer_offset, + tile_access_iterator_.valid() + ); + + ++tile_access_iterator_; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { + tile_access_iterator_.set_iteration_index(0); + load_with_pointer_offset(frag, 0); + } + + CUTLASS_DEVICE + void advance() { + tile_access_iterator_.advance(); + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // dispatch to iterator implementation + return TileAccessIterator::can_implement(problem_size); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..13d8338c2f --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h @@ -0,0 +1,254 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dWgradActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Filter postion (r,s,c) in contiguous dimension stays constant for each gemm_iteration_k + int filter_r_[ThreadMap::Iterations::kContiguous]; + int filter_s_[ThreadMap::Iterations::kContiguous]; + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_npq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) + { + + layout::PitchLinearCoord thread_coord = 
ThreadMap::initial_offset(thread_idx); + + // initialize r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int rsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C); + int residual = rsc_offset % (problem_size_.S * problem_size_.C); + + filter_s_[c] = residual / problem_size_.C; + filter_c_[c] = residual % problem_size_.C; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_npq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int r = filter_r_[iteration_contiguous_]; + int s = filter_s_[iteration_contiguous_]; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q); + int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
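`at()` above first unrolls the linear npq offset into (n, p, q) and then maps that output position, together with the filter position held by the contiguous slot, back to an input location. A worked host-side example with made-up convolution parameters (cross-correlation mode, so r and s are not mirrored):

```cpp
#include <cstdio>

int main() {
  // Hypothetical problem: output extent P x Q = 4 x 5, stride 2, padding 1, dilation 1.
  int const P = 4, Q = 5;
  int const stride_h = 2, stride_w = 2, pad_h = 1, pad_w = 1, dilation_h = 1, dilation_w = 1;

  int offset_npq = 37;                    // linear GEMM-K position
  int n = offset_npq / (P * Q);           // 37 / 20 = 1
  int residual = offset_npq % (P * Q);    // 17
  int p = residual / Q;                   // 3
  int q = residual % Q;                   // 2

  int r = 1, s = 0;                       // filter position owned by this contiguous slot
  int h = p * stride_h - pad_h + r * dilation_h;  // 3*2 - 1 + 1 = 6
  int w = q * stride_w - pad_w + s * dilation_w;  // 2*2 - 1 + 0 = 3

  printf("npq=%d -> (n,p,q)=(%d,%d,%d), activation (h,w)=(%d,%d)\n",
         offset_npq, n, p, q, h, w);
  return 0;
}
```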
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..74a887794b --- /dev/null +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h @@ -0,0 +1,273 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dWgradActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dWgradActivationIteratorOptimizedParams; + +private: + + Conv2dWgradActivationIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Precomputed effective filter postion (r,s) in contiguous dimension stays constant for each gemm_iteration_k + // required for npq -> nhw translation + int precomputed_filter_r_[ThreadMap::Iterations::kContiguous]; + int precomputed_filter_s_[ThreadMap::Iterations::kContiguous]; + + // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_npq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorOptimized( + Conv2dWgradActivationIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) + { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int rsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C); + // int residual = rsc_offset % (problem_size_.S * problem_size_.C); + // + // filter_s_[c] = residual / problem_size_.C; + // filter_c_[c] = residual % problem_size_.C; + + int residual; + params_.sc_divmod(precomputed_filter_r_[c], residual, rsc_offset); + params_.c_divmod(precomputed_filter_s_[c], 
filter_c_[c], residual); + + int r = precomputed_filter_r_[c]; + int s = precomputed_filter_s_[c]; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + precomputed_filter_r_[c] = - problem_size_.pad_h + r * problem_size_.dilation_h; + precomputed_filter_s_[c] = - problem_size_.pad_w + s * problem_size_.dilation_w; + + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_npq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q); + // int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, p, q; + + params_.pq_divmod(n, residual, offset_npq_[iteration_strided_]); + params_.q_divmod(p, q, residual); + + int h = p * problem_size_.stride_h + precomputed_filter_r_[iteration_contiguous_]; + int w = q * problem_size_.stride_w + precomputed_filter_s_[iteration_contiguous_]; + + return TensorCoord(n, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
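+  ///
+  /// The alignment test below corresponds to 128-bit vectorized global accesses along the
+  /// iterator's contiguous dimension. As an illustrative example (hypothetical element
+  /// types, not exhaustive): cutlass::half_t gives 128/16 = 8, float gives 128/32 = 4, and
+  /// int8_t gives 128/8 = 16 as the required divisor.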
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % (128/sizeof_bits<Element>::value)) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
new file mode 100644
index 0000000000..84c788d6d4
--- /dev/null
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
@@ -0,0 +1,234 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
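+
+    For the Wgrad output gradient iterator below, the contiguous dimension indexes the
+    K channels of Dy (the GEMM-M extent) and the strided dimension indexes the flattened
+    (N, P, Q) positions (the GEMM-K extent), which advance() steps by one CTA-K tile
+    scaled by split_k_slices.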
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dWgradOutputGradientTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_[ThreadMap::Iterations::kContiguous]; + + int offset_npq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize filter_k for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] = threadblock_offset.column() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + 
     offset_npq_[s] += Shape::kColumn * problem_size_.split_k_slices;
+    }
+  }
+
+  /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+
+    int npq = offset_npq_[iteration_strided_];
+
+    int n = npq / (problem_size_.P * problem_size_.Q);
+    int residual = npq % (problem_size_.P * problem_size_.Q);
+
+    int p = residual / problem_size_.Q;
+    int q = residual % problem_size_.Q;
+
+    return TensorCoord(n, p, q, filter_k_[iteration_contiguous_]);
+  }
+
+
+  /// Returns true if the current coordinate is within the output gradient tensor Dy
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+    TensorCoord coord = at();
+
+    return coord.n() < problem_size_.N &&
+      coord.h() < problem_size_.P &&
+      coord.w() < problem_size_.Q &&
+      coord.c() < problem_size_.K;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    return reinterpret_cast<AccessType const *>(pointer_ + offset * sizeof_bits<Element>::value / 8);
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dWgradOutputGradientTileAccessIteratorAnalytic &operator++() {
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % (128/sizeof_bits<Element>::value)) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    return Status::kSuccess;
+  }
+
+};
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
new file mode 100644
index 0000000000..4a20cb1d8b
--- /dev/null
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
@@ -0,0 +1,300 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv2dWgradOutputGradientTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params : Conv2dWgradOutputGradientIteratorOptimizedParams { + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dWgradOutputGradientIteratorOptimizedParams const &base): + Conv2dWgradOutputGradientIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dWgradOutputGradientIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, 
ThreadMap::Delta::kStrided} + ) { } + }; + +private: + + Conv2dWgradOutputGradientIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_k_; + int offset_npq_; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorOptimized( + Conv2dWgradOutputGradientIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_k_(0), + offset_npq_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.contiguous(); + offset_npq_ = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous; + int offset_npq = offset_npq_ + s * ThreadMap::Delta::kStrided; + + bool predicate = valid_(at_(offset_npq, filter_k)); + + uint32_t pred = (predicate ? 1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) + pointer_ += ( + offset_npq_ * params.layout.stride()[0] + filter_k_ + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile + offset_npq_ += Shape::kColumn * problem_size_.split_k_slices; + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (offset_npq_ + s * ThreadMap::Delta::kStrided >= params_.NPQ) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + predicates_ = (predicates_ & (~kClearMask)); + } + } + + pointer_ += params_.inc_next_npq; + } + +private: + /// Returns the coordinate in the output gradient tensor Dy that is pointed to + /// by offset_npq and k. 
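+  ///
+  /// Illustrative decomposition with hypothetical sizes P = 14, Q = 14: offset_npq = 1000
+  /// yields n = 1000 / 196 = 5, residual = 20, p = 20 / 14 = 1, q = 20 % 14 = 6, i.e.
+  /// TensorCoord(5, 1, 6, k). The precomputed fast_divmod parameters below reproduce this
+  /// result without integer division in the inner loop.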
+ CUTLASS_HOST_DEVICE + TensorCoord at_(int offset_npq, int k) const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int npq = offset_npq; + // int n = npq / (problem_size_.P * problem_size_.Q); + // int residual = npq % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, p, q; + + params_.pq_divmod(n, residual, offset_npq); + params_.q_divmod(p, q, residual); + + return TensorCoord(n, p, q, k); + } + + /// Returns true if the coord is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid_(TensorCoord coord) const { + + return coord.n() < problem_size_.N && + coord.c() < problem_size_.K; + } + +public: + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast( + pointer_ + + iteration_strided_ * params_.offset_next_strided + + iteration_contiguous_ * params_.offset_next_contiguous + ); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..0033568278 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h @@ -0,0 +1,263 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dDgradFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or larger."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // For a fixed filter position (t,r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension + 
int filter_t_; + int filter_r_; + int filter_s_; + int offset_k_[ThreadMap::Iterations::kStrided]; + int offset_c_[ThreadMap::Iterations::kContiguous]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_t_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = + threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the filter tensor w that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int c = offset_c_[iteration_contiguous_]; + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_t_, filter_r_, filter_s_, c); + } + + /// Returns true if the current coordinate is within the filter tensor w + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
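+  ///
+  /// Intended host-side usage is a sketch along these lines (Iterator denotes a fully
+  /// specialized instance of this template):
+  ///
+  ///   if (Iterator::can_implement(problem_size) != Status::kSuccess) {
+  ///     // dispatch a different kernel or report the unsupported problem size
+  ///   }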
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..47e7de46a0 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 +1,331 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided +> +class Conv3dDgradOutputGradientTileAccessIteratorAnalytic; +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv3dDgradOutputGradientTileAccessIteratorAnalytic strided dgrad needs special handling using +// unscaled coordinations +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dDgradOutputGradientTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kStrided +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + ConvProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_; + int filter_t_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_d_[ThreadMap::Iterations::kStrided]; + int offset_w_[ThreadMap::Iterations::kStrided]; + int offset_h_[ThreadMap::Iterations::kStrided]; + +private: + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator but DOES NOT scale by the convolution stride. This is needed + /// to compute predicates in the valid() method. The return value of the public at() + /// method is correctly scaled. 
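+  ///
+  /// Example with hypothetical parameters pad_h = 1, dilation_h = 1, stride_h = 2:
+  /// input row h = 3 and filter row r = 0 give an unscaled p of h + pad_h - r * dilation_h = 4;
+  /// valid() requires 4 % stride_h == 0 and at() then returns p = 4 / stride_h = 2.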
+ CUTLASS_HOST_DEVICE + TensorCoord unscaled_at_() const { + int n = offset_n_[iteration_strided_]; + int d = offset_d_[iteration_strided_]; + int h = offset_h_[iteration_strided_]; + int w = offset_w_[iteration_strided_]; + + int t = filter_t_; + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - t); + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int z = (d + problem_size_.pad_d - t * problem_size_.dilation_d); + int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h); + int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w); + + return TensorCoord(n, z, p, q, filter_k_); + } + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0), + filter_t_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W); + int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W); + + offset_d_[s] = residual / (problem_size_.H * problem_size_.W); + residual = residual % (problem_size_.H * problem_size_.W); + + offset_h_[s] = residual / problem_size_.W; + offset_w_[s] = residual % problem_size_.W; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // move to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + filter_k_ += Shape_::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator. 
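+  ///
+  /// Note that advance() above steps the filter position with s fastest, then r, then t;
+  /// only after a full T * R * S sweep does filter_k_ move by Shape::kColumn * split_k_slices,
+  /// so a hypothetical 3x3x3 filter takes 27 advance() calls per GEMM-K block.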
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + TensorCoord coord = unscaled_at_(); + + return TensorCoord( + coord.n(), + coord.d() / problem_size_.stride_d, + coord.h() / problem_size_.stride_h, + coord.w() / problem_size_.stride_w, + coord.c()); + } + + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord unscaled_coord = unscaled_at_(); + TensorCoord coord = at(); + + return + !(unscaled_coord.d() % problem_size_.stride_d) && + !(unscaled_coord.h() % problem_size_.stride_h) && + !(unscaled_coord.w() % problem_size_.stride_w) && + coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.Z && + coord.h() >= 0 && coord.h() < problem_size_.P && + coord.w() >= 0 && coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..f5d14b5b10 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h @@ -0,0 +1,296 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dFpropActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + ConvProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_t_; + int filter_r_; + int filter_s_; + int filter_c_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_z_[ThreadMap::Iterations::kStrided]; + int offset_p_[ThreadMap::Iterations::kStrided]; + int 
offset_q_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorAnalytic( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_t_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + + offset_z_[s] = residual / (problem_size_.P * problem_size_.Q); + residual = residual % (problem_size_.P * problem_size_.Q); + + offset_p_[s] = residual / problem_size_.Q; + offset_q_[s] = residual % problem_size_.Q; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + filter_c_ += Shape::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. 
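+  ///
+  /// Illustrative mapping with hypothetical parameters stride_d = 1, pad_d = 1,
+  /// dilation_d = 1: output index z = 0 with filter index t = 0 gives
+  /// d = z * stride_d - pad_d + t * dilation_d = -1, which valid() rejects because the
+  /// access falls in the zero-padding region (d < 0).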
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + int n = offset_n_[iteration_strided_]; + int z = offset_z_[iteration_strided_]; + int p = offset_p_[iteration_strided_]; + int q = offset_q_[iteration_strided_]; + + int t = filter_t_; + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - filter_t_); + r = (problem_size_.R - 1 - filter_r_); + s = (problem_size_.S - 1 - filter_s_); + } + + int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d; + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, d, h, w, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.D && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + AccessType const *ptr = reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..bad6598baf --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h @@ -0,0 +1,262 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dFpropFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + ConvProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params 
const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_t_; + int filter_r_; + int filter_s_; + int filter_c_; + + int offset_k_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorAnalytic( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_t_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + filter_c_ += Shape::kRow * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the filter tensor W that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_t_, filter_r_, filter_s_, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..0ad49abd31 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h @@ -0,0 +1,281 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
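+
+    For Wgrad, rows of the GEMM B tile index the flattened (N, Z, P, Q) output positions and
+    columns index the flattened (T, R, S, C) filter positions; this iterator translates each
+    (row, column) pair back into an activation coordinate (n, d, h, w, c).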
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Filter postion (t,r,s,c) in contiguous dimension stays constant for each gemm_iteration_k + int filter_t_[ThreadMap::Iterations::kContiguous]; + int filter_r_[ThreadMap::Iterations::kContiguous]; + int filter_s_[ThreadMap::Iterations::kContiguous]; + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_nzpq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorAnalytic( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize t,r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int trsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * problem_size_.C); + int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C); + + filter_r_[c] = residual / (problem_size_.S * problem_size_.C); + residual = residual % (problem_size_.S * problem_size_.C); + + filter_s_[c] = residual / problem_size_.C; + filter_c_[c] = residual % problem_size_.C; + + } + + // initialize n, z, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < 
ThreadMap::Iterations::kStrided; ++s) { + + offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int t = filter_t_[iteration_contiguous_]; + int r = filter_r_[iteration_contiguous_]; + int s = filter_s_[iteration_contiguous_]; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - t); + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q); + int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q); + + int z = residual / (problem_size_.P * problem_size_.Q); + residual = residual % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d; + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.D && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..35c4643052 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h @@ -0,0 +1,346 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
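+
+    Compared with the Analytic variant, this Optimized iterator precomputes fast integer
+    division/modulo parameters and the effective (padded, dilated) filter offsets once in
+    Params and the constructor, so per-access coordinate computation in device code avoids
+    integer division.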
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + int RSC; // product of R*S*C + unsigned rsc_mul; // precomputed quantities for fast computation of div/% by RSC + unsigned rsc_shr; // in device code. + + int SC; // product of S*C + unsigned sc_mul; // precomputed quantities for fast computation of div/% by SC + unsigned sc_shr; // in device code. + + unsigned c_mul; // precomputed quantities for fast computation of div/% by C + unsigned c_shr; // in device code. + + int ZPQ; // product of Z*P*Q + unsigned zpq_mul; // precomputed quantities for fast computation of div/% by ZPQ + unsigned zpq_shr; // in device code. + + int PQ; // product of P*Q + unsigned pq_mul; // precomputed quantities for fast computation of div/% by PQ + unsigned pq_shr; // in device code. + + unsigned q_mul; // precomputed quantities for fast computation of div/% by Q + unsigned q_shr; // in device code. + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + // Precompute several quantities for fast modulo arithmetic. 
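+      // find_divisor() computes a (multiplier, shift) pair for a runtime-constant divisor so
+      // that fast_divmod() can later replace division and modulo with a multiply-and-shift
+      // sequence in device code. Illustrative usage only (not part of this constructor):
+      //
+      //   unsigned mul, shr;
+      //   find_divisor(mul, shr, divisor);                                 // precompute once
+      //   fast_divmod(quotient, remainder, dividend, divisor, mul, shr);   // per access
+      //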
+ RSC = problem_size.R * problem_size.S * problem_size.C; + find_divisor(rsc_mul, rsc_shr, RSC); + + SC = problem_size.S * problem_size.C; + find_divisor(sc_mul, sc_shr, SC); + + find_divisor(c_mul, c_shr, problem_size.C); + + ZPQ = problem_size.Z * problem_size.P * problem_size.Q; + find_divisor(zpq_mul, zpq_shr, ZPQ); + + PQ = problem_size.P * problem_size.Q; + find_divisor(pq_mul, pq_shr, PQ); + + find_divisor(q_mul, q_shr, problem_size.Q); + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Precomputed effective filter postion (t,r,s) in contiguous dimension stays constant for each gemm_iteration_k + // required for nzpq -> ndhw translation + int precomputed_filter_t_[ThreadMap::Iterations::kContiguous]; + int precomputed_filter_r_[ThreadMap::Iterations::kContiguous]; + int precomputed_filter_s_[ThreadMap::Iterations::kContiguous]; + + // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_nzpq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorOptimized( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize t,r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int trsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * problem_size_.C); + // int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C); + // + // filter_r_[c] = residual / (problem_size_.S * problem_size_.C); + // residual = residual % (problem_size_.S * problem_size_.C); + // + // filter_s_[c] = residual / problem_size_.C; + // filter_c_[c] = residual % problem_size_.C; + + int residual; + fast_divmod(precomputed_filter_t_[c], residual, trsc_offset, params_.RSC, params_.rsc_mul, params_.rsc_shr); + fast_divmod(precomputed_filter_r_[c], residual, residual, params_.SC, params_.sc_mul, params_.sc_shr); + fast_divmod(precomputed_filter_s_[c], filter_c_[c], residual, problem_size_.C, params_.c_mul, params_.c_shr); + + int t = precomputed_filter_t_[c]; + int r = precomputed_filter_r_[c]; + int s = precomputed_filter_s_[c]; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - t); + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + // efective t,r,s for every contiguous dimension + precomputed_filter_t_[c] = - problem_size_.pad_d + t * problem_size_.dilation_d; + precomputed_filter_r_[c] = - problem_size_.pad_h + r * problem_size_.dilation_h; + precomputed_filter_s_[c] = - problem_size_.pad_w + s * problem_size_.dilation_w; + + + } + + // initialize n, z, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * 
ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q); + // int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q); + // + // int z = residual / (problem_size_.P * problem_size_.Q); + // residual = residual % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, z, p, q; + fast_divmod(n, residual, offset_nzpq_[iteration_strided_], params_.ZPQ, params_.zpq_mul, params_.zpq_shr); + fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr); + fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr); + + int d = z * problem_size_.stride_d + precomputed_filter_t_[iteration_contiguous_]; + int h = p * problem_size_.stride_h + precomputed_filter_r_[iteration_contiguous_];; + int w = q * problem_size_.stride_w + precomputed_filter_s_[iteration_contiguous_]; + + return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.D && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000..74017c09f6 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 +1,256 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
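+
+    For Wgrad, the GEMM A tile is drawn from the output gradient tensor Dy: rows index the K
+    output channels and columns index the flattened (N, Z, P, Q) output positions.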
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradOutputGradientTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_[ThreadMap::Iterations::kContiguous]; + + int offset_nzpq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize filter_k for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] = threadblock_offset.column() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next 
GEMM-K offset (offset_nzpq_) in GEMM-A by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] += Shape::kColumn * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int nzpq = offset_nzpq_[iteration_strided_]; + + int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + + int z = residual / (problem_size_.P * problem_size_.Q); + residual = residual % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + return TensorCoord(n, z, p, q, filter_k_[iteration_contiguous_]); + } + + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() < problem_size_.Z && + coord.h() < problem_size_.P && + coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..2cab09d1f3 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,330 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradOutputGradientTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + int NZPQ; // precomputd product of N*Z*P*Q for clearing predicates + int ZPQ; // product of Z*P*Q + unsigned zpq_mul; // precomputed quantities for fast computation of div/% by ZPQ + unsigned zpq_shr; // in device code. 
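+
+    // The (mul, shr) pairs here and below are produced by find_divisor() in the constructor
+    // and consumed by fast_divmod() in at_() so that device code avoids integer division.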
+ + int PQ; // product of P*Q + unsigned pq_mul; // precomputed quantities for fast computation of div/% by PQ + unsigned pq_shr; // in device code. + + unsigned q_mul; // precomputed quantities for fast computation of div/% by Q + unsigned q_shr; // in device code. + + LongIndex offset_next_strided; // offset in units of bytes to next nzpq coordinate within tile + LongIndex offset_next_contiguous; // offset in units of bytes to next k coordinate within tile + LongIndex inc_next_nzpq; // offset in units of bytes to next nzpq position in subsequent tile + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + // Incremental offsets in unites of bytes (number of elements) * sizeof_bits::value / 8 + offset_next_strided = (ThreadMap::Delta::kStrided * layout.stride()[0]) + * sizeof_bits::value / 8; + + offset_next_contiguous = (ThreadMap::Delta::kContiguous) + * sizeof_bits::value / 8; + + inc_next_nzpq = (Shape::kColumn * problem_size.split_k_slices * layout.stride()[0]) + * sizeof_bits::value / 8; + + // Precompute several quantities for fast modulo arithmetic. + NZPQ = problem_size.N * problem_size.Z * problem_size.P * problem_size.Q; + ZPQ = problem_size.Z * problem_size.P * problem_size.Q; + find_divisor(zpq_mul, zpq_shr, ZPQ); + + PQ = problem_size.P * problem_size.Q; + find_divisor(pq_mul, pq_shr, PQ); + + find_divisor(q_mul, q_shr, problem_size.Q); + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_k_; + int offset_nzpq_; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorOptimized( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_k_(0), + offset_nzpq_(0) { + + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.contiguous(); + offset_nzpq_ = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous; + int offset_nzpq = offset_nzpq_ + s * ThreadMap::Delta::kStrided; + + bool predicate = valid_(at_(offset_nzpq, filter_k)); + + uint32_t pred = (predicate ? 
1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) + pointer_ += ( + offset_nzpq_ * params.layout.stride()[0] + filter_k_ + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile + offset_nzpq_ += Shape::kColumn * problem_size_.split_k_slices; + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (offset_nzpq_ + s * ThreadMap::Delta::kStrided >= params_.NZPQ) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + predicates_ = (predicates_ & (~kClearMask)); + } + } + pointer_ += params_.inc_next_nzpq; + } + +private: + /// Returns the coordinate in the output gradient tensor Dy that is (offset_nzpq, k) pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at_(int offset_nzpq, int k) const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int nzpq = offset_nzpq_; + // int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + // int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + // + // int z = residual / (problem_size_.P * problem_size_.Q); + // residual = residual % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, z, p, q; + fast_divmod(n, residual, offset_nzpq, params_.ZPQ, params_.zpq_mul, params_.zpq_shr); + fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr); + fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr); + + return TensorCoord(n, z, p, q, k); + } + + /// Returns true if the coord is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid_(TensorCoord coord) const { + + return coord.n() < problem_size_.N && + coord.c() < problem_size_.K; + } + +public: + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast( + pointer_ + + iteration_strided_ * params_.offset_next_strided + + iteration_contiguous_ * params_.offset_next_contiguous + ); + + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < 
ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h new file mode 100644 index 0000000000..1702847c10 --- /dev/null +++ b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h @@ -0,0 +1,480 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel. 
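+
+    The mainloop keeps Stages threadblock tiles of each operand resident in shared memory and
+    uses cp.async (NVIDIA Ampere architecture) to overlap global->shared copies for future
+    stages with warp-level MMA on the current stage.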
+*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class ImplicitGemmMultistage : + public gemm::threadblock::MmaBase { +public: + ///< Base class + using Base = gemm::threadblock::MmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + + using ElementC = typename Policy::Operator::ElementC; + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Internal structure exposed for introspection. 
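+  /// Detail derives, from the operand thread maps, the number of cp.async instructions needed
+  /// to load one stage of each operand and how those instructions are divided into groups
+  /// issued per warp-level GEMM iteration.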
+ struct Detail { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + ImplicitGemmMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance( + IteratorA &iterator_A, IteratorB &iterator_B, + int group_start_A = 0, int group_start_B = 0) { + + iterator_A.set_iteration_index(group_start_A); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / 8; + + 
cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B); + + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const &src_accum, + ///< Imaginary strides used for planar-complex only - ignored here + int64_t imag_stride_A = 0, + int64_t imag_stride_B = 0) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.advance(); + iterator_B.advance(); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // Waits until kStages-2 stages have committed. 
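+    // cp_async_wait<Base::kStages - 2>() allows at most kStages - 2 committed cp.async groups
+    // to remain in flight, guaranteeing that the data for the first stage consumed below has
+    // arrived in shared memory.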
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance(iterator_A, iterator_B); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0], + warp_loaded_frag_A[0], warp_loaded_frag_B[0]); + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + // Issue global->shared copies for the next stage + int group_start_iteration_A, group_start_iteration_B; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + group_start_iteration_A = 0; + group_start_iteration_B = 0; + } else { + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + } + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, + group_start_iteration_B); + + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum + ); + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) + warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + // Inserts a fence to group cp.async instructions into stages. 
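+          // The stage transition (fence, wait, iterator advance, shared-memory pointer wrap)
+          // is issued on the second-to-last warp-level k-group so that its latency overlaps
+          // with the remaining math of the current stage.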
+ cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.advance(); + iterator_B.advance(); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + } + } + + } + + // Insert fence and wait for all outstanding cp.async operations to commit. + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h new file mode 100644 index 0000000000..0d56ab6b3f --- /dev/null +++ b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h @@ -0,0 +1,313 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_ = NumericArrayConverter< + typename SmemIteratorA_::Element, + typename IteratorA_::Element, + IteratorA_::Fragment::kElements>, + /// + /// Transformation applied to A operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool +> +class ImplicitGemmPipelined : public gemm::threadblock::MmaBase { +public: + + ///< Base class + using Base = gemm::threadblock::MmaBase; + + using Shape = Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static 
ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); + +private: + + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + +protected: + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + ImplicitGemmPipelined( + typename Base::SharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) { + + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC &accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const &src_accum, ///< source accumulator tile + TransformA transform_A = TransformA(), ///< transformation applied to A fragment + TransformB transform_B = TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + + tb_frag_A.clear(); + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + 
this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. + + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + } + + warp_mma(accum, warp_frag_A[warp_mma_k % 2], + warp_frag_B[warp_mma_k % 2], accum); + } + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index 1f624f1fa8..bd69a707d3 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -38,6 +38,9 @@ #include "cutlass/layout/pitch_linear.h" #include "cutlass/tensor_view.h" #include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -156,13 +159,23 @@ namespace gemm { template inline std::ostream & operator<<(std::ostream &out, GemmShape const &gemm_shape) { - out << "cutlass::GemmShape::(kM, kN, kK) {" + out << "cutlass::gemm::GemmShape::(kM, kN, kK) {" << cutlass::gemm::GemmShape::kM <<"," << cutlass::gemm::GemmShape::kN <<"," << cutlass::gemm::GemmShape::kK << "}"; return out; } +/// Default printing to ostream for GemmCoord +inline +std::ostream & operator<<(std::ostream &out, GemmCoord const &gemm_coord) { 
+ out << "cutlass::gemm::GemmCoord:: {" + << gemm_coord.m() <<"," + << gemm_coord.n() <<"," + << gemm_coord.k() << "}"; + return out; +} + } //namespace gemm /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -185,5 +198,44 @@ std::ostream & operator<<(std::ostream &out, PitchLinearShape { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } @@ -354,7 +354,7 @@ class LinearCombinationClamp { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kCount; ++i) { - scaled_accumulator[i] = static_cast(intermediate[i]); + scaled_accumulator[i] = __float2int_rn(intermediate[i]); } // Convert to destination numeric type @@ -385,7 +385,7 @@ class LinearCombinationClamp { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kCount; ++i) { - scaled_accumulator[i] = static_cast(intermediate[i]); + scaled_accumulator[i] = __float2int_rn(intermediate[i]); } // Convert to destination numeric type @@ -495,7 +495,7 @@ class FastLinearCombinationClamp { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } diff --git a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h index 3934af1041..68f334bdb8 100644 --- a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h +++ b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h @@ -134,7 +134,7 @@ class LinearCombinationPlanarComplex { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } diff --git a/include/cutlass/epilogue/thread/linear_combination_relu.h b/include/cutlass/epilogue/thread/linear_combination_relu.h index 7a2fa9e8af..7a41404791 100644 --- a/include/cutlass/epilogue/thread/linear_combination_relu.h +++ b/include/cutlass/epilogue/thread/linear_combination_relu.h @@ -28,6 +28,7 @@ #pragma once +#include #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" #include "cutlass/array.h" @@ -77,7 +78,6 @@ class LinearCombinationRelu { ElementCompute threshold; ///< minimum value that is output ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - ElementCompute const *threshold_ptr; ///< pointer to threshold scalar - if not null, loads from memory // // Methods // @@ -88,15 +88,14 @@ class LinearCombinationRelu { beta(ElementCompute(0)), threshold(ElementCompute(0)), alpha_ptr(nullptr), - beta_ptr(nullptr), - threshold_ptr(nullptr) { } + beta_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha, ElementCompute beta, ElementCompute threshold = ElementCompute(0) - ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr), threshold_ptr(nullptr) { + ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { } @@ -104,8 +103,8 @@ class LinearCombinationRelu { Params( ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, - 
ElementCompute const *threshold_ptr = nullptr - ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold_ptr(threshold_ptr) { + ElementCompute threshold = ElementCompute(0) + ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } }; @@ -128,7 +127,7 @@ class LinearCombinationRelu { alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); - threshold_ = (params.threshold_ptr ? *params.threshold_ptr : params.threshold); + threshold_ = params.threshold; } /// Returns true if source is needed @@ -139,10 +138,16 @@ class LinearCombinationRelu { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } + + if (k_partition != k_partition_count - 1) { + // set to NaN to make ReLU no-op for all except last k partitions + int64_t allones = -1; + threshold_ = reinterpret_cast(allones); + } } /// Computes linear scaling: D = alpha * accumulator + beta * source @@ -205,7 +210,6 @@ class LinearCombinationRelu { } }; - ///////////////////////////////////////////////////////////////////////////////////////////////// // Conditional guards to enable partial specialization for packed integers @@ -245,7 +249,6 @@ class LinearCombinationRelu { ElementCompute threshold; ///< minimum value that is output ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - ElementCompute const *threshold_ptr; ///< pointer to threshold scalar - if not null, loads from memory // // Methods // @@ -256,15 +259,14 @@ class LinearCombinationRelu { beta(ElementCompute(0)), threshold(ElementCompute(0)), alpha_ptr(nullptr), - beta_ptr(nullptr), - threshold_ptr(nullptr) { } + beta_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha, ElementCompute beta, ElementCompute threshold = ElementCompute(0) - ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr), threshold_ptr(nullptr) { + ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { } @@ -272,8 +274,8 @@ class LinearCombinationRelu { Params( ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, - ElementCompute const *threshold_ptr = nullptr - ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold_ptr(threshold_ptr) { + ElementCompute threshold = ElementCompute(0) + ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } }; @@ -296,7 +298,7 @@ class LinearCombinationRelu { alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); - threshold_ = (params.threshold_ptr ? 
*params.threshold_ptr : params.threshold); + threshold_ = params.threshold; } /// Returns true if source is needed @@ -307,10 +309,16 @@ class LinearCombinationRelu { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } + + if (k_partition != k_partition_count - 1) { + // set to NaN to make ReLU no-op for all except last k partitions + int64_t allones = -1; + threshold_ = reinterpret_cast(allones); + } } /// Computes linear scaling: D = alpha * accumulator + beta * source @@ -331,26 +339,41 @@ class LinearCombinationRelu { multiplies mul_add_source; multiply_add mul_add_accumulator; - ReLu relu; + ReLu relu; intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - // Convert floats back to INT - FragmentAccumulator scaled_accumulator; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kCount; ++i) { - scaled_accumulator[i] = static_cast(intermediate[i]); - } - // Compute threshold optionally - scaled_accumulator = relu(threshold_, scaled_accumulator); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; + intermediate = relu(threshold_, intermediate); - return destination_converter(scaled_accumulator); + if (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value) { + // Convert floats back to INT + FragmentAccumulator scaled_accumulator; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + scaled_accumulator[i] = __float2int_rn(intermediate[i]); + } + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(scaled_accumulator); + } else { + NumericArrayConverter + destination_converter; + return destination_converter(intermediate); + } } /// Computes linear scaling: D = alpha * accumulator @@ -367,25 +390,48 @@ class LinearCombinationRelu { ComputeFragment intermediate; multiplies mul_accumulator; - ReLu relu; + ReLu relu; intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + // Compute threshold optionally + intermediate = relu(threshold_, intermediate); + // Convert floats back to INT FragmentAccumulator scaled_accumulator; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kCount; ++i) { - scaled_accumulator[i] = static_cast(intermediate[i]); + scaled_accumulator[i] = __float2int_rn(intermediate[i]); } - // Compute threshold optionally - scaled_accumulator = relu(threshold_, scaled_accumulator); - - // Convert to destination numeric type - NumericArrayConverter destination_converter; - - return destination_converter(scaled_accumulator); + if (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value) { + // Convert floats back to INT + FragmentAccumulator scaled_accumulator; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + scaled_accumulator[i] = __float2int_rn(intermediate[i]); + } + + // Convert to destination 
numeric type + NumericArrayConverter + destination_converter; + + return destination_converter(scaled_accumulator); + } else { + NumericArrayConverter + destination_converter; + return destination_converter(intermediate); + } } }; @@ -398,4 +444,3 @@ class LinearCombinationRelu { } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h index 3a65c49acf..dbefd2258c 100644 --- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -133,7 +133,7 @@ class LinearCombinationSigmoid { /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition) { + void set_k_partition(int k_partition, int k_partition_count) { if (k_partition) { beta_ = ElementCompute(1); } diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h index 8390ee0b47..08b829be1d 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h @@ -367,6 +367,52 @@ struct DefaultInterleavedEpilogueTensorOp { //////////////////////////////////////////////////////////////////////////////// +/// Defines sensible defaults for epilogues for TensorOps which uses +/// intereleaved output layout. For this case, shared memory is not needed. +template +struct DefaultInterleavedConvEpilogue { + using Shape = Shape_; + using WarpMmaTensorOp = WarpMmaTensorOp_; + static int const kPartitionsK = PartitionsK; + using OutputOp = OutputOp_; + static int const kElementsPerAccess = ElementsPerAccess; + + using ElementOutput = typename OutputOp::ElementOutput; + using ElementAccumulator = typename WarpMmaTensorOp::ElementC; + + // + // Thread map + // + using OutputTileThreadMap = typename cutlass::epilogue::threadblock:: + DefaultInterleavedConvThreadMapTensorOp< + Shape, typename WarpMmaTensorOp::Shape, kPartitionsK, ElementOutput, + kElementsPerAccess, InterleavedK>::Type; + + using OutputTileIterator = + cutlass::epilogue::threadblock::InterleavedConvPredicatedTileIterator< + OutputTileThreadMap, ElementOutput, InterleavedK>; + + using AccumulatorFragmentIterator = + cutlass::epilogue::warp::FragmentIteratorTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + // can reuse the gemm version here to do element selection + layout::ColumnMajorInterleaved>; + + // + // Define the epilogue + // + using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue< + Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator, + AccumulatorFragmentIterator, OutputOp, InterleavedK, IsBetaZero>; +}; + +//////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace epilogue } // namespace cutlass diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h index 96e4335cab..752b1ee9b4 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h @@ -144,6 
+144,55 @@ struct DefaultInterleavedThreadMapTensorOp { Detail::kThreads, kElementsPerAccess, sizeof_bits::value>; }; + +//////////////////////////////////////////////////////////////////////////////// + +/// Defines the optimal thread map for TensorOp accumulator layouts +template +struct DefaultInterleavedConvThreadMapTensorOp { + using ThreadblockShape = ThreadblockShape_; + using WarpShape = WarpShape_; + static int const kPartitionsK = PartitionsK; + using Element = Element_; + static int const kElementsPerAccess = ElementsPerAccess; + static int const kInterleavedK = InterleavedK; + + // + // Definitions + // + + struct Detail { + /// Tensor Operations fundamentally perform operations on 8 rows + static int const kTensorOpRows = 8; + static int const kWarpSize = 32; + + static_assert(!(ThreadblockShape::kM % WarpShape::kM) && + !(ThreadblockShape::kN % WarpShape::kN), + "Divisibility"); + + /// Number of warps + using WarpCount = + gemm::GemmShape; + + /// Number of participating threads + static int const kThreads = WarpCount::kCount * kWarpSize; + }; + + // + // ThreadMap + // + + /// ThreadMap to be used by epilogue::MaskedTileIterator satisfying concept + /// InterleavedOutputTileThreadMap + using Type = InterleavedConvOutputTileThreadMap< + MatrixShape, + MatrixShape, + Detail::kThreads, kElementsPerAccess, sizeof_bits::value>; +}; + //////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/output_iterator_parameter.h b/include/cutlass/epilogue/threadblock/output_iterator_parameter.h new file mode 100644 index 0000000000..8cfba768c1 --- /dev/null +++ b/include/cutlass/epilogue/threadblock/output_iterator_parameter.h @@ -0,0 +1,92 @@ +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/tensor_ref.h" + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +template< + typename TensorLayout_, ///! The original output tensor layout + typename OutputIteratorLayout_, ///! Layout used by epilogue output iterator + typename TensorRef_, ///! Input tensor to epilogue output iterator + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ ///! Convolutional operator on 2D or 3D problem +> +struct ConvOutputIteratorParameter { + + using TensorLayout = TensorLayout_; + using OutputIteratorLayout = OutputIteratorLayout_; + using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord; + using TensorRef = TensorRef_; + static conv::Operator const kConvolutionalOperator = ConvOperator; + using ConvProblemSize = ConvProblemSize_; + + /// Wgrad stride idx for implicit gemm algorithm + // Conv2d row-major matrix (KxRSC) + // Conv3d row-major matrix (KxTRSC) + static int const kWgradStrideIdx = + platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? 
kWgradStrideIdx : 0); + + + CUTLASS_HOST_DEVICE + static OutputIteratorLayout layout(const TensorRef & ref) { + return ref.stride(kTensorStrideIdx); + } + + CUTLASS_HOST_DEVICE + static OutputTensorCoord extent(ConvProblemSize problem_size) { + return conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(); + } + +}; + + + +template < + int InterleavedK, + typename TensorRef_, + conv::Operator ConvOperator, + typename ConvProblemSize_ +> +struct ConvOutputIteratorParameter< + layout::TensorNCxHWx, + layout::TensorNCxHWx, + TensorRef_, + ConvOperator, + ConvProblemSize_> +{ + + using TensorLayout = typename layout::TensorNCxHWx; + using OutputIteratorLayout = typename layout::TensorNCxHWx; + using OutputTensorCoord = typename OutputIteratorLayout::TensorCoord; + using TensorRef = TensorRef_; + static conv::Operator const kConvolutionalOperator = ConvOperator; + using ConvProblemSize = ConvProblemSize_; + + CUTLASS_HOST_DEVICE + static OutputIteratorLayout layout(const TensorRef & ref) { + return ref.stride(); + } + + CUTLASS_HOST_DEVICE + static OutputTensorCoord extent(ConvProblemSize problem_size) { + return problem_size.output_extent(); + } + +}; + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass diff --git a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h index 4eb5e3784b..cfe13cc167 100644 --- a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h +++ b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h @@ -488,6 +488,68 @@ struct InterleavedOutputTileThreadMap { } }; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template metaprogram for partitioning a 4D interleaved layout across warps +/// to achieve several performance objectives: +/// +/// - coalesced memory accesses in units of 64 Byte lines +/// - minimal address arithmetic +/// - minimal predicate calculations +/// +template +struct InterleavedConvOutputTileThreadMap { + using WarpCount = WarpCount_; + + static int const kWarpSize = 32; + static int const kThreads = Threads; + static int const kWarpCount = kThreads / kWarpSize; + + static int const kElementsPerAccess = ElementsPerAccess; + static int const kElementSize = ElementSize; + + // + // Metaprogram computation + // + + struct Detail {}; + + // + // Output + // + + using Iterations = Iterations_; + + using Delta = MatrixShape; + + /// Initial offset function + CUTLASS_HOST_DEVICE + static MatrixCoord initial_offset(int thread_idx) { + int warp_idx = thread_idx / kWarpSize; + int lane_idx = thread_idx % kWarpSize; + + // Compute warp location + MatrixCoord warp_footprint{ + Delta::kRow * Iterations::kRow, + Delta::kColumn * Iterations::kColumn, + }; + + MatrixCoord warp_offset{warp_idx % WarpCount::kRow, + warp_idx / WarpCount::kRow}; + + // Compute per-lane offset + MatrixCoord thread_offset_in_warp{lane_idx / 4, + (lane_idx % 4) * kElementsPerAccess}; + + MatrixCoord thread_offset_in_threadblock_tile = + warp_footprint * warp_offset + thread_offset_in_warp; + + return thread_offset_in_threadblock_tile; + } +}; + //////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h index 05af759a5e..1be50cbd90 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h +++ 
b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h @@ -43,6 +43,7 @@ #include "cutlass/epilogue/threadblock/output_tile_thread_map.h" #include "cutlass/arch/arch.h" #include "cutlass/arch/memory.h" +#include "cutlass/epilogue/threadblock/predicated_tile_iterator_params.h" //////////////////////////////////////////////////////////////////////////////// @@ -102,68 +103,20 @@ class PredicatedTileIterator { // Parameters struct // - struct Params { - - // - // Data members - // - - LongIndex stride; ///< stride in bytes between rows - - LongIndex increment_row; ///< increment quantity (in bytes) to advance when moving between rows - LongIndex increment_group; ///< increment quantity (in bytes) to advance when moving to the next group - LongIndex increment_cluster; ///< increment quantity (in bytes) to advance when moving to the next cluster - - LongIndex advance_row; ///< amount to add to move to the next 'row' position - LongIndex advance_group; ///< amount to add to move to the next 'group' position - LongIndex advance_cluster; ///< amount to add to move to the next 'cluster' position - LongIndex advance_tile; ///< amount to add to move to the next 'tile' - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Status initialize(Index stride_) { - - stride = LongIndex(stride_); - - increment_row = stride * ThreadMap::Delta::kRow; - - increment_group = stride * ThreadMap::Delta::kGroup - - stride * ThreadMap::Delta::kRow * (ThreadMap::Iterations::kRow - 1); - - increment_cluster = stride * ThreadMap::Delta::kCluster - - stride * ThreadMap::Delta::kGroup * (ThreadMap::Iterations::kGroup - 1) - - stride * ThreadMap::Delta::kRow * (ThreadMap::Iterations::kRow - 1); - - advance_row = stride * ThreadMap::Shape::kRow; - - advance_group = stride * (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow; - - advance_cluster = - stride * - ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;; - - advance_tile = - stride * - ThreadMap::Shape::kGroup * - ThreadMap::Shape::kRow * - ThreadMap::Shape::kCluster * - ThreadMap::Shape::kTile; - - return Status::kSuccess; - } + /// Uses a non-template class + struct Params : PredicatedTileIteratorParams { CUTLASS_HOST_DEVICE - Params() { - initialize(0); - } + Params() { } CUTLASS_HOST_DEVICE - Params(Layout const &layout) { - - initialize(layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess); + Params(Layout const &layout): + PredicatedTileIteratorParams( + layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, + make_OutputTileThreadMapDesc() + ) + { + } }; @@ -207,7 +160,7 @@ class PredicatedTileIterator { // /// Parameters structure containing reference and precomputed state. - Params params_; + PredicatedTileIteratorParams params_; /// Byte-level pointer uint8_t *byte_pointer_; @@ -239,12 +192,13 @@ class PredicatedTileIterator { /// Constructor CUTLASS_DEVICE PredicatedTileIterator( - Params const & params, + PredicatedTileIteratorParams const & params, Element *pointer, TensorCoord extent, int thread_idx, TensorCoord threadblock_offset = TensorCoord() - ): params_(params) + ): + params_(params) { TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; @@ -745,6 +699,309 @@ class InterleavedPredicatedTileIterator { }; /////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load output tile from shared memory in epilogue. 
+/// +/// Satisfies: ReadableTileIterator | InterleavedMaskedTileIterator | ForwardTileIterator +/// +template < + typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap) + typename Element_, ///< Element data type + int InterleavedN ///< Number of Interleaved N +> +class InterleavedConvPredicatedTileIterator { +public: + using ThreadMap = ThreadMap_; + + using Element = Element_; + + using Layout = layout::TensorNCxHWx; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = Tensor4DCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + static int const kThreads = ThreadMap::kThreads; + static int const kIterations = ThreadMap::Iterations::kCount; + + /// Fragment object + using Fragment = Array; + + /// Memory access size + using AccessType = AlignedArray; + + // + // Parameters struct + // + + struct Params { + + // + // Data members + // + + LongIndex stride_col; ///< stride in bytes between columns + LongIndex stride_row; ///< stride in bytes between rows + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Status initialize(typename Layout::Stride stride_) { + stride_col = stride_[1]; + stride_row = stride_[2]; + + return Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Params() { + initialize(cutlass::make_Coord(0, 0, 0)); + } + + CUTLASS_HOST_DEVICE + Params(Layout const &layout) { + + initialize(layout.stride()); + } + }; + + /// Mask object + struct Mask { + static int const kCount = + (ThreadMap::Iterations::kRow < 8) ? 8 : ThreadMap::Iterations::kRow; + + /// Predicate state + bool predicates[kCount]; + + // + // Mask + // + CUTLASS_HOST_DEVICE + Mask() { + enable(); + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_HOST_DEVICE void clear() { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + predicates[i] = false; + } + } + + ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask + CUTLASS_DEVICE void enable() { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + predicates[i] = true; + } + } + }; + +private: + + // + // Data members + // + + /// Parameters structure containing reference and precomputed state. 
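// ---------------------------------------------------------------------------
// [Editorial note] Address pattern used by this iterator (see the constructor
// and load() below): the output column k is split into an interleaved channel
// block (k / InterleavedN) and an offset within that block (k % InterleavedN),
// while the output row decomposes into the batch index n and the spatial index
// pq via fast_divmod. Standalone sketch of the resulting element offset; for a
// packed NC/xHWx output tensor, stride_col is on the order of InterleavedN * P * Q
// and stride_row on the order of C * P * Q (both stated here as assumptions for
// illustration only):
CUTLASS_HOST_DEVICE
int64_t ncxhwx_element_offset_sketch(
    int n, int pq, int k,
    int64_t stride_col, int64_t stride_row, int interleaved_n) {

  return (k / interleaved_n) * stride_col + (k % interleaved_n) +
         int64_t(n) * stride_row + int64_t(pq) * interleaved_n;
}
// ---------------------------------------------------------------------------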
+ Params params_; + + /// Byte-level pointer + uint8_t *byte_pointer_; + + /// Array of boolean values to contain steady-state predicates + Mask mask_; + + /// Extent of the matrix tile in columns + Index extent_col_; + + /// Extent of the matrix tile in rows + Index extent_row_; + + /// Extent of the matrix tile in pq + Index extent_pq_; + + /// A thread's starting row position (assuming steady-state predicates have + /// been computed) + Index thread_start_row_; + + /// A thread's starting column position (assuming steady-state predicates have + /// been computed) + Index thread_start_col_; + + /// Internal iteration counter + LongIndex iteration_row_; + LongIndex iteration_col_; + + uint32_t pq_mul_; + + uint32_t pq_shr_; + +private: + + // + // Methods + // + +public: + + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + InterleavedConvPredicatedTileIterator( + Params const & params, + Element *pointer, + TensorCoord extent, + int thread_idx, + MatrixCoord threadblock_offset + ): + params_(params) { + MatrixCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; + + extent_col_ = extent.c(); + extent_pq_ = extent.h() * extent.w(); + extent_row_ = extent.n() * extent_pq_; + + find_divisor(pq_mul_, pq_shr_, extent_pq_); + + thread_start_row_ = thread_offset.row(); + thread_start_col_ = thread_offset.column(); + + // Initialize predicates + CUTLASS_PRAGMA_UNROLL + for (int r = 0; r < ThreadMap::Iterations::kRow; ++r) { + mask_.predicates[r] = + ((thread_offset.row() + ThreadMap::Delta::kRow * r) < extent_row_); + } + + // Initialize pointer + byte_pointer_ = reinterpret_cast(pointer) + + ((thread_start_col_ / InterleavedN) * params_.stride_col + + (thread_start_col_ % InterleavedN)) * + sizeof_bits::value / 8; + + // Initialize internal state counter + iteration_row_ = iteration_col_ = 0; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + byte_pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { + + int col_offset = iteration_col_ * ThreadMap::Delta::kColumn; + bool col_guard = ((thread_start_col_ + col_offset) < extent_col_); + bool guard = col_guard && mask_.predicates[iteration_row_]; + + int n, pq_rem; + + fast_divmod(n, pq_rem, + thread_start_row_ + iteration_row_ * ThreadMap::Delta::kRow, + extent_pq_, pq_mul_, pq_shr_); + + uint8_t *byte_pointer = + byte_pointer_ + (n * params_.stride_row + pq_rem * InterleavedN) * + sizeof_bits::value / 8; + AccessType *frag_ptr = reinterpret_cast(&frag); + AccessType const *memory_pointer = + reinterpret_cast(byte_pointer); + + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + *frag_ptr, + (void *)memory_pointer, + guard); + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store(Fragment const &frag) { + + int col_offset = iteration_col_ * ThreadMap::Delta::kColumn; + bool col_guard = ((thread_start_col_ + col_offset) < extent_col_); + bool guard = col_guard && mask_.predicates[iteration_row_]; + + int n, pq_rem; + + fast_divmod(n, pq_rem, + thread_start_row_ + iteration_row_ * ThreadMap::Delta::kRow, + extent_pq_, pq_mul_, pq_shr_); + + uint8_t *byte_pointer = + byte_pointer_ + (n * params_.stride_row + pq_rem * InterleavedN) * + sizeof_bits::value / 8; + AccessType const *frag_ptr = reinterpret_cast(&frag); + AccessType *memory_pointer = reinterpret_cast(byte_pointer); + + if (guard) { + *memory_pointer = 
*frag_ptr; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int iteration) { + iteration_row_ = iteration % ThreadMap::Iterations::kRow; + iteration_col_ = iteration / ThreadMap::Iterations::kRow; + } + + /// Advances to the next position to load or store + CUTLASS_HOST_DEVICE + InterleavedConvPredicatedTileIterator &operator++() { + + ++iteration_row_; + + if (iteration_row_ == ThreadMap::Iterations::kRow) { + + iteration_row_ = 0; + ++iteration_col_; + byte_pointer_ += params_.stride_col; + + if (iteration_col_ == ThreadMap::Iterations::kColumn) { + iteration_col_ = 0; + } + } + + return *this; + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_DEVICE void clear_mask() { + mask_.clear(); + } + + ///< Efficiently enables all accesses guarded by mask + CUTLASS_DEVICE void enable_mask() { + mask_.enable(); + } + + ///< Sets the mask + CUTLASS_DEVICE void get_mask(Mask &mask) { + return mask_; + } + + ///< Sets the mask + CUTLASS_DEVICE void set_mask(Mask const &mask) { + mask_ = mask; + } +}; + /////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h new file mode 100644 index 0000000000..a08e1e0616 --- /dev/null +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h @@ -0,0 +1,227 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct OutputTileShapeDesc { + + int column; + int row; + int group; + int cluster; + int tile; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + OutputTileShapeDesc(): column(0), row(0), group(0), cluster(0), tile(0) { } + + /// Ctor + CUTLASS_HOST_DEVICE + OutputTileShapeDesc( + int column_, + int row_, + int group_, + int cluster_, + int tile_ + ): + column(column_), + row(row_), + group(group_), + cluster(cluster_), + tile(tile_) { } + + /// Total number of points in the 5D space + CUTLASS_HOST_DEVICE + int count() const { + return column * row * group * cluster * tile; + } +}; + +/// Helper template to construct an OutputTileShapeDesc from a OutputTileShape template. +template +CUTLASS_HOST_DEVICE +OutputTileShapeDesc make_OutputTileShapeDesc() { + return OutputTileShapeDesc( + Shape::kColumn, + Shape::kRow, + Shape::kGroup, + Shape::kCluster, + Shape::kTile + ); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Thread map description +struct OutputTileThreadMapDesc { + + int threads; + int elements_per_access; + OutputTileShapeDesc shape; + OutputTileShapeDesc iterations; + OutputTileShapeDesc delta; + OutputTileShapeDesc count; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + OutputTileThreadMapDesc() { } + + CUTLASS_HOST_DEVICE + OutputTileThreadMapDesc( + int threads_, + int elements_per_access_, + OutputTileShapeDesc shape_, + OutputTileShapeDesc iterations_, + OutputTileShapeDesc delta_, + OutputTileShapeDesc count_ + ): + threads(threads_), + elements_per_access(elements_per_access_), + shape(shape_), + iterations(iterations_), + delta(delta_), + count(count_) { } +}; + +/// Helper template to construct an OutputTileShapeDesc from a OutputTileThreadMap template. 
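// ---------------------------------------------------------------------------
// [Editorial note] Usage sketch for OutputTileShapeDesc above, placed before the
// thread-map helper declared next. The descriptor carries the five compile-time
// extents of an epilogue output tile as runtime values, so iterator Params no
// longer need the thread map as a template parameter. The extents below are
// illustrative only:
CUTLASS_HOST_DEVICE
inline int output_tile_point_count_sketch() {
  OutputTileShapeDesc desc(64, 8, 2, 2, 1);   // column, row, group, cluster, tile
  return desc.count();                        // 64 * 8 * 2 * 2 * 1 = 2048
}
// For a compile-time OutputTileShape<64, 8, 2, 2, 1>, the same descriptor is
// produced by make_OutputTileShapeDesc<Shape>() defined above.
// ---------------------------------------------------------------------------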
+template +CUTLASS_HOST_DEVICE +OutputTileThreadMapDesc make_OutputTileThreadMapDesc() { + return OutputTileThreadMapDesc( + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + make_OutputTileShapeDesc(), + make_OutputTileShapeDesc(), + make_OutputTileShapeDesc(), + make_OutputTileShapeDesc() + ); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Parameters struct +// + +struct PredicatedTileIteratorParams { + + using Index = int32_t; + using LongIndex = int64_t; + + // + // Data members + // + + LongIndex stride; ///< stride in bytes between rows + + LongIndex increment_row; ///< increment quantity (in bytes) to advance when moving between rows + LongIndex increment_group; ///< increment quantity (in bytes) to advance when moving to the next group + LongIndex increment_cluster; ///< increment quantity (in bytes) to advance when moving to the next cluster + + LongIndex advance_row; ///< amount to add to move to the next 'row' position + LongIndex advance_group; ///< amount to add to move to the next 'group' position + LongIndex advance_cluster; ///< amount to add to move to the next 'cluster' position + LongIndex advance_tile; ///< amount to add to move to the next 'tile' + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Status initialize(Index stride_, OutputTileThreadMapDesc thread_map) { + + stride = LongIndex(stride_); + + increment_row = stride * thread_map.delta.row; + + increment_group = stride * thread_map.delta.group + - stride * thread_map.delta.row * (thread_map.iterations.row - 1); + + increment_cluster = stride * thread_map.delta.cluster + - stride * thread_map.delta.group * (thread_map.iterations.group - 1) + - stride * thread_map.delta.row * (thread_map.iterations.row - 1); + + advance_row = stride * thread_map.shape.row; + + advance_group = + stride * + (thread_map.shape.group - 1) * thread_map.shape.row * thread_map.count.row; + + advance_cluster = + stride * + thread_map.count.group * + thread_map.shape.group * + thread_map.count.row * + thread_map.shape.row; + + advance_tile = + stride * + thread_map.shape.group * + thread_map.shape.row * + thread_map.shape.cluster * + thread_map.shape.tile; + + return Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + PredicatedTileIteratorParams() { + initialize(0, OutputTileThreadMapDesc()); + } + + CUTLASS_HOST_DEVICE + PredicatedTileIteratorParams(Index stride, OutputTileThreadMapDesc thread_map) { + + initialize(stride, thread_map); + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h index 79106b111e..b2a0612ac5 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h @@ -37,7 +37,7 @@ #pragma once -#if !defined(__clang__) +#if !(defined(__clang__) && defined(__CUDA__)) #include "cutlass/wmma_array.h" #include "cutlass/layout/matrix.h" @@ -152,5 +152,7 @@ class FragmentIteratorWmmaTensorOp> shr : src; #else - quo = int((div != 1) ? int(src * mul) >> shr : src); + quo = int((div != 1) ? int(((int64_t)src * mul) >> 32) >> shr : src); #endif // The remainder. 
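// ---------------------------------------------------------------------------
// [Editorial note] What the hunk above corrects: on the non-CUDA (host) path,
// magic-number division must take the high 32 bits of the 64-bit product (the
// equivalent of __umulhi() on the device) before applying the shift. A
// standalone reference version, assuming (mul, shr) were produced by
// find_divisor():
inline void fast_divmod_reference(int &quo, int &rem, int src, int div,
                                  unsigned int mul, unsigned int shr) {
  quo = (div != 1) ? int((int64_t(src) * mul) >> 32) >> shr : src;
  rem = src - quo * div;
}
// ---------------------------------------------------------------------------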
@@ -215,7 +215,7 @@ void fast_divmod(int& quo, int64_t& rem, int64_t src, int div, unsigned int mul, // Use IMUL.HI if div != 1, else simply copy the source. quo = (div != 1) ? __umulhi(src, mul) >> shr : src; #else - quo = int((div != 1) ? (src * mul) >> shr : src); + quo = int((div != 1) ? ((src * mul) >> 32) >> shr : src); #endif // The remainder. rem = src - (quo * div); diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 90cf394941..d20c45df2e 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -161,6 +161,42 @@ struct negate { } }; +/// Greater equal +template +struct greater_equal { + CUTLASS_HOST_DEVICE + bool operator()(T const &lhs, T const &rhs) const { + return (lhs >= rhs); + } +}; + +/// Greater +template +struct greater { + CUTLASS_HOST_DEVICE + bool operator()(T const &lhs, T const &rhs) const { + return (lhs > rhs); + } +}; + +/// Less equal +template +struct less_equal { + CUTLASS_HOST_DEVICE + bool operator()(T const &lhs, T const &rhs) const { + return (lhs <= rhs); + } +}; + +/// Less +template +struct less { + CUTLASS_HOST_DEVICE + bool operator()(T const &lhs, T const &rhs) const { + return (lhs < rhs); + } +}; + /// Fused multiply-add template struct multiply_add { @@ -189,6 +225,40 @@ struct xor_add { } }; +template +struct conjugate { + CUTLASS_HOST_DEVICE + T operator()(T const &a) const { + return a; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct conjugate> { + CUTLASS_HOST_DEVICE + complex operator()(complex const &a) const { + return conj(a); + } +}; + +template +struct conjugate > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + + conjugate conj_op; + + Array ca; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + ca[i] = conj_op(a[i]); + } + return ca; + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specialization for complex to target four scalar fused multiply-adds. 
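// ---------------------------------------------------------------------------
// [Editorial note] Usage sketch for the functors added in the hunks above:
// greater_equal / greater / less_equal / less mirror their std:: counterparts
// but are CUTLASS_HOST_DEVICE, and conjugate<> is an identity for real types
// with specializations for complex<T> and Array<T, N>. (Assumes cutlass/complex.h
// is available alongside cutlass/functional.h; values are illustrative.)
CUTLASS_HOST_DEVICE
inline bool functional_usage_sketch() {
  less<float> lt;
  greater_equal<int> ge;

  conjugate<complex<float>> conj_op;
  complex<float> z(1.0f, 2.0f);
  complex<float> zc = conj_op(z);        // (1, -2)

  return lt(0.5f, 1.0f) && ge(3, 3) && (zc.imag() == -2.0f);
}
// ---------------------------------------------------------------------------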
@@ -1499,6 +1569,86 @@ struct multiply_add, Array, Array +CUTLASS_HOST_DEVICE +Array operator+(Array const &lhs, Array const &rhs) { + plus> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator-(Array const &lhs, Array const &rhs) { + minus> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator-(Array const &lhs) { + negate> op; + return op(lhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator*(Array const &lhs, Array const &rhs) { + multiplies> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator*(T lhs, Array const &rhs) { + multiplies> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator*(Array const &lhs, T rhs) { + multiplies> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator/(Array const &lhs, Array const &rhs) { + divides> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array fma(Array const &a, Array const &b, Array const &c) { + multiply_add> op; + return op(a, b, c); +} + +template +CUTLASS_HOST_DEVICE +Array fma(T a, Array const &b, Array const &c) { + multiply_add> op; + return op(a, b, c); +} + +template +CUTLASS_HOST_DEVICE +Array fma(Array const &a, T b, Array const &c) { + multiply_add> op; + return op(a, b, c); +} + +template +CUTLASS_HOST_DEVICE +Array fma(Array const &a, Array const &b, T c) { + multiply_add> op; + return op(a, b, c); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/device/gemm_sparse.h b/include/cutlass/gemm/device/gemm_sparse.h index df2a141cd1..bfd5606e1f 100644 --- a/include/cutlass/gemm/device/gemm_sparse.h +++ b/include/cutlass/gemm/device/gemm_sparse.h @@ -429,6 +429,25 @@ class SparseGemm { args.epilogue, static_cast(workspace) }; + + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = cudaFuncSetAttribute( + Kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } return Status::kSuccess; } @@ -461,30 +480,11 @@ class SparseGemm { dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); dim3 block(GemmKernel::kThreadCount, 1, 1); - cudaError_t result; - int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); - if (smem_size >= (48 << 10)) { - result = cudaFuncSetAttribute(Kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size); - - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - - result = cudaFuncSetAttribute( - Kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 100); - - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - } cutlass::Kernel<<>>(params_); - result = cudaGetLastError(); + cudaError_t result = cudaGetLastError(); return result == cudaSuccess ? 
Status::kSuccess : Status::kErrorInternal; } diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index 12a8a6d7f3..a669483541 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -117,9 +117,16 @@ class GemmUniversalAdapter { using ThreadblockShape = typename GemmKernel::Mma::Shape; using WarpShape = typename GemmKernel::WarpShape; using InstructionShape = typename GemmKernel::InstructionShape; - - using OperatorClass = typename GemmKernel::OperatorClass; - using ArchTag = typename GemmKernel::ArchTag; + + // warp-level, arch-level (instruction), math operator + using WarpMmaOperator = typename GemmKernel::Mma::Policy::Operator; + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + // Operator class and arch tag extract bottom-up + // set it for top-level gemm device-level template + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; // Type, layout, and complex transform deliberately exchanged with B using MapArguments = detail::MapArguments< diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h index fc52a08d0f..9ffc6b041c 100644 --- a/include/cutlass/gemm/device/gemm_universal_base.h +++ b/include/cutlass/gemm/device/gemm_universal_base.h @@ -311,6 +311,27 @@ class GemmUniversalBase { gemm_k_size, static_cast(workspace) ); + + // Specify shared memory capacity for kernel. + int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = cudaFuncSetAttribute( + Kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } return Status::kSuccess; } @@ -335,38 +356,31 @@ class GemmUniversalBase { Status run(cudaStream_t stream = nullptr) { CUTLASS_TRACE_HOST("GemmUniversalBase::run()"); + // + // Configure grid and block dimensions + // + ThreadblockSwizzle threadblock_swizzle; dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); dim3 block(GemmKernel::kThreadCount, 1, 1); - cudaError_t result; - int smem_size = int(sizeof(typename GemmKernel::SharedStorage)); - if (smem_size >= (48 << 10)) { - result = cudaFuncSetAttribute(Kernel, - cudaFuncAttributeMaxDynamicSharedMemorySize, - smem_size); - - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - result = cudaFuncSetAttribute( - Kernel, - cudaFuncAttributePreferredSharedMemoryCarveout, 100); - - if (result != cudaSuccess) { - return Status::kErrorInternal; - } - } + // + // Launch kernel + // CUTLASS_TRACE_HOST(" grid: (" << grid << "), block: (" << block << "), SMEM: " << smem_size << " bytes"); + // Launch cutlass::Kernel<<>>(params_); - result = cudaGetLastError(); + // + // Query for errors + // + cudaError_t result = cudaGetLastError(); if (result != cudaSuccess) { CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result)); diff --git a/include/cutlass/gemm/kernel/default_gemm_complex.h b/include/cutlass/gemm/kernel/default_gemm_complex.h index 15b1430c79..cff06e69de 100644 --- a/include/cutlass/gemm/kernel/default_gemm_complex.h 
+++ b/include/cutlass/gemm/kernel/default_gemm_complex.h @@ -49,6 +49,7 @@ #include "cutlass/gemm/kernel/gemm_pipelined.h" #include "cutlass/gemm/threadblock/default_mma_core_sm75.h" #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" #include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h" #include "cutlass/gemm/threadblock/default_mma.h" #include "cutlass/gemm/threadblock/default_multistage_mma_complex.h" @@ -112,6 +113,101 @@ struct DefaultGemmComplex; //////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Multiply-add operator + // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial + > +struct DefaultGemmComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementC, + layout::RowMajor, ElementAccumulator, arch::OpClassSimt, + arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, + EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> { + + /// Define the threadblock-scoped matrix multiply-accumulate + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, + WarpShape, + InstructionShape, + ElementA, LayoutA, + ElementB, LayoutB, + ElementAccumulator, layout::RowMajor, + arch::OpClassSimt, + Stages, + Operator, + false, + cutlass::arch::CacheOperation::Global, + cutlass::arch::CacheOperation::Global, + TransformA, + TransformB + >; + + // Define iterators over tiles from the A operand + using IteratorA = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, + typename MmaCore::IteratorThreadMapA>; + + // Define iterators over tiles from the B operand + using IteratorB = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, + typename MmaCore::IteratorThreadMapB>; + + // Define the threadblock-scoped pipelined matrix multiply + using Mma = cutlass::gemm::threadblock::MmaPipelined< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + IteratorB, typename MmaCore::SmemIteratorB, ElementAccumulator, + layout::RowMajor, typename MmaCore::MmaPolicy>; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + typename Mma::Operator, + 
EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + /// Define the kernel-level GEMM operator. + using GemmKernel = kernel::Gemm; +}; + +//////////////////////////////////////////////////////////////////////////////// + /// Partial specialization for Ampere Architecture template < /// Element type for A matrix operand @@ -170,6 +266,70 @@ struct DefaultGemmComplex< using GemmKernel = kernel::Gemm; }; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Multiply-add operator + // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial + > +struct DefaultGemmComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementC, + layout::RowMajor, ElementAccumulator, arch::OpClassSimt, + arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, + EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> { + + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator, + layout::RowMajor, arch::OpClassSimt, arch::Sm80, ThreadblockShape, + WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + typename Mma::Operator, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + /// Define the kernel-level GEMM operator. + using GemmKernel = kernel::Gemm; +}; + //////////////////////////////////////////////////////////////////////////////// } // namespace kernel diff --git a/include/cutlass/gemm/kernel/gemm.h b/include/cutlass/gemm/kernel/gemm.h index fc2daa9759..ce61137f36 100644 --- a/include/cutlass/gemm/kernel/gemm.h +++ b/include/cutlass/gemm/kernel/gemm.h @@ -138,8 +138,20 @@ struct Gemm { typename Epilogue::OutputTileIterator::TensorRef ref_C, typename Epilogue::OutputTileIterator::TensorRef ref_D) { - static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; - static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentA = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = (platform::is_same>::value) + ? 
32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorB::AccessType::kElements; static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; if (!TensorRef_aligned(ref_A, kAlignmentA)) { @@ -274,7 +286,7 @@ struct Gemm { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op.set_k_partition(threadblock_tile_offset.k()); + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } // Tile iterator loading from source tensor. diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex.h b/include/cutlass/gemm/kernel/gemm_planar_complex.h index aede20dae5..b9626145fe 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h @@ -582,7 +582,7 @@ struct GemmPlanarComplex { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op.set_k_partition(threadblock_tile_offset.k()); + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } } else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) { diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h index 99ece26742..bba6217160 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.h +++ b/include/cutlass/gemm/kernel/gemm_universal.h @@ -302,8 +302,20 @@ struct GemmUniversal { CUTLASS_TRACE_HOST("GemmUniversal::can_implement()"); - static int const kAlignmentA = Mma::IteratorA::AccessType::kElements; - static int const kAlignmentB = Mma::IteratorB::AccessType::kElements; + static int const kAlignmentA = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorA::AccessType::kElements; + static int const kAlignmentB = (platform::is_same>::value) + ? 32 + : (platform::is_same>::value) + ? 64 + : Mma::IteratorB::AccessType::kElements; static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; if ((problem_size.m() % kAlignmentA) || (problem_size.k() % kAlignmentA) || @@ -468,7 +480,7 @@ struct GemmUniversal { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op.set_k_partition(threadblock_tile_offset.k()); + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } } else if (params.mode == GemmUniversalMode::kGemmSplitKParallel) { diff --git a/include/cutlass/gemm/kernel/sparse_gemm.h b/include/cutlass/gemm/kernel/sparse_gemm.h index 7db469e534..730745fdc8 100644 --- a/include/cutlass/gemm/kernel/sparse_gemm.h +++ b/include/cutlass/gemm/kernel/sparse_gemm.h @@ -319,7 +319,7 @@ struct SparseGemm { semaphore.fetch(); // Indicate which position in a serial reduction the output operator is currently updating - output_op.set_k_partition(threadblock_tile_offset.k()); + output_op.set_k_partition(threadblock_tile_offset.k(), params.grid_tiled_shape.k()); } // Tile iterator loading from source tensor. 
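Several of the hunks above (gemm_sparse.h and gemm_universal_base.h) move the opt-in for more than 48 KB of dynamic shared memory from run() into initialize(), so the attribute is set once per operator rather than on every launch. The underlying CUDA pattern is roughly the sketch below; configure_large_smem is a placeholder helper for illustration, not CUTLASS API.

#include <cuda_runtime.h>

// Opt a kernel into a large dynamic shared memory allocation and prefer shared memory
// over L1. Returns the first CUDA error encountered, if any.
template <typename KernelFn>
cudaError_t configure_large_smem(KernelFn *kernel, int smem_size) {
  cudaError_t result = cudaSuccess;
  if (smem_size >= (48 << 10)) {
    // Raise the per-kernel limit on dynamic shared memory above the 48 KB default.
    result = cudaFuncSetAttribute(
        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
    if (result == cudaSuccess) {
      // Hint that this kernel prefers shared memory over L1 (100% carveout).
      result = cudaFuncSetAttribute(
          kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 100);
    }
  }
  return result;
}

// Typical call site, mirroring initialize() above:
//   configure_large_smem(cutlass::Kernel<GemmKernel>,
//                        int(sizeof(typename GemmKernel::SharedStorage)));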
diff --git a/include/cutlass/gemm/thread/mma_sm60.h b/include/cutlass/gemm/thread/mma_sm60.h index 486497cb79..07e2d55629 100644 --- a/include/cutlass/gemm/thread/mma_sm60.h +++ b/include/cutlass/gemm/thread/mma_sm60.h @@ -93,6 +93,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -179,6 +182,9 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -270,6 +276,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -356,6 +365,8 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; // // Methods // @@ -443,6 +454,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -533,6 +547,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -623,6 +640,9 @@ struct Mma_HFMA2 < /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -714,6 +734,9 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -800,6 +823,9 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // @@ -879,6 +905,9 @@ struct Mma_HFMA2< /// C operand storage using FragmentC = Array; + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + // // Methods // diff --git a/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/include/cutlass/gemm/threadblock/default_mma_core_simt.h index be50149372..ba3a161650 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_simt.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_simt.h @@ -389,7 +389,7 @@ struct DefaultMmaCore, ElementA_, /// Policy used to define MmaPipelined using MmaPolicy = MmaPolicy< MmaWarpSimt, - MatrixShape, // skew for A matrix to avoid SMEM bank conflicts + MatrixShape, // skew for A matrix to avoid SMEM bank conflicts MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts WarpCount::kK >; diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h index 7f3d534a1f..36c5c54ee9 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h @@ -34,6 +34,7 @@ #include "cutlass/gemm/threadblock/default_mma_core_sm80.h" #include "cutlass/numeric_types.h" #include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h index 
230e8d7681..697d22bf6d 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h @@ -1105,6 +1105,676 @@ struct DefaultMultistageMmaComplexCore< //////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + typename RealA, + typename RealB, + typename RealC, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<1, 1, 1>, + complex, layout::ColumnMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassSimt, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<1, 1, 1>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of access + static int const kAccessSizeInBits = sizeof_bits::value; + + /// No vectorized accesses + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// Policy of iterator B + 
using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator B + using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape<0, 0>, + MatrixShape<0, Shape::kK / 32>, + WarpCount::kK>; +}; + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + typename RealA, + typename RealB, + typename RealC, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<1, 1, 1>, + complex, layout::ColumnMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassSimt, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, 
CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<1, 1, 1>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of access + static int const kAccessSizeInBits = sizeof_bits::value; + + /// No vectorized accesses + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape<0, 0>, + MatrixShape<0, 0>, // or Shape::kK / 32 + WarpCount::kK>; +}; + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + typename RealA, + typename RealB, + typename RealC, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<1, 1, 1>, + complex, layout::RowMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassSimt, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<1, 1, 1>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + 
"This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of access + static int const kAccessSizeInBits = sizeof_bits::value; + + /// No vectorized accesses + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator B + using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape, + MatrixShape<0, Shape::kK / 32>, + WarpCount::kK>; +}; + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + typename RealA, + typename RealB, + typename RealC, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<1, 1, 1>, + complex, layout::RowMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassSimt, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<1, 1, 1>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This 
specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of access + static int const kAccessSizeInBits = sizeof_bits::value; + + /// No vectorized accesses + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 
2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape, + MatrixShape<0, 0>, // or Shape::kK / 32 + WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace gemm diff --git a/include/cutlass/gemm/threadblock/mma_multistage.h b/include/cutlass/gemm/threadblock/mma_multistage.h index 0431c3060f..804e3373a3 100644 --- a/include/cutlass/gemm/threadblock/mma_multistage.h +++ b/include/cutlass/gemm/threadblock/mma_multistage.h @@ -228,7 +228,7 @@ class MmaMultistage : for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { auto gmem_ptr = iterator_A.get(); - cutlass::arch::cp_async( + cutlass::arch::cp_async_zfill( dst_ptr + v, gmem_ptr, iterator_A.valid()); ++iterator_A; @@ -258,7 +258,7 @@ class MmaMultistage : for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { auto gmem_ptr = iterator_B.get(); - cutlass::arch::cp_async( + cutlass::arch::cp_async_zfill( dst_ptr + v, gmem_ptr, iterator_B.valid()); ++iterator_B; @@ -513,6 +513,11 @@ class MmaMultistage : } } + + // commit and drain all pending and predicated LDGSTS pnz from the GEMM mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); } }; diff --git a/include/cutlass/gemm/threadblock/mma_singlestage.h b/include/cutlass/gemm/threadblock/mma_singlestage.h index 32d4d4ee60..373d985ac6 100644 --- a/include/cutlass/gemm/threadblock/mma_singlestage.h +++ b/include/cutlass/gemm/threadblock/mma_singlestage.h @@ -105,6 +105,14 @@ class MmaSingleStage : public MmaBase { /// Warp-level Mma using Operator = typename Policy::Operator; + using ArchTag = arch::Sm70; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + // staticaly assert kStages for MmaSingleStage is 1 (single stage mma pipeline) static_assert((Base::kStages==1), "MmaSingleStage requires kStages set to value 1"); private: diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_complex_tensor_op.h index 2dc72fd333..a34c16df07 100644 --- a/include/cutlass/gemm/warp/mma_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op.h @@ -314,8 +314,17 @@ class MmaComplexTensorOp< /// Shape 
of the warp in units of thread (concept: MmaLanePolicyTensorOp) using Policy = Policy_; + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + + /// Architecture tag from underlying instruction + using ArchTag = typename ArchMmaOperator::ArchTag; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + /// Shape of underlying instruction - using InstructionShape = typename Policy::Operator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; /// Complex transform on A operand static ComplexTransform const kTransformA = TransformA; @@ -323,9 +332,6 @@ class MmaComplexTensorOp< /// Complex transform on B operand static ComplexTransform const kTransformB = TransformB; - /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; - /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -337,7 +343,7 @@ class MmaComplexTensorOp< Operand::kA, ElementA, LayoutA, - MatrixShape, + MatrixShape, Policy::OpDelta::kRow, 32, 1 @@ -355,7 +361,7 @@ class MmaComplexTensorOp< Operand::kB, ElementB, LayoutB, - MatrixShape, + MatrixShape, Policy::OpDelta::kColumn, 32, 1 @@ -368,14 +374,14 @@ class MmaComplexTensorOp< using TransformedFragmentB = FragmentB; static_assert( - !(Shape::kM % Policy::Operator::Shape::kM) && - !(Shape::kN % Policy::Operator::Shape::kN), + !(Shape::kM % ArchMmaOperator::Shape::kM) && + !(Shape::kN % ArchMmaOperator::Shape::kN), "Shape of warp-level Mma must be divisible by operator shape."); /// Number of mma operations performed using MmaIterations = MatrixShape< - Shape::kM / Policy::Operator::Shape::kM, - Shape::kN / Policy::Operator::Shape::kN + Shape::kM / ArchMmaOperator::Shape::kM, + Shape::kN / ArchMmaOperator::Shape::kN >; /// Iterates over the C operand in memory @@ -383,7 +389,7 @@ class MmaComplexTensorOp< MatrixShape, ElementC, LayoutC, - typename Policy::Operator::Shape, + typename ArchMmaOperator::Shape, typename Policy::OpDelta>; /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this @@ -393,7 +399,7 @@ class MmaComplexTensorOp< using FragmentC = typename IteratorC::Fragment; static_assert( - FragmentC::kElements == 2 * MmaIterations::kCount * Policy::Operator::FragmentC::kElements, + FragmentC::kElements == 2 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements, "Unexpected planar complex fragment length."); private: @@ -403,7 +409,7 @@ class MmaComplexTensorOp< // /// Underlying real-valued matrix multiply operator (concept: arch::Mma) - typename Policy::Operator mma; + ArchMmaOperator mma; public: @@ -425,9 +431,9 @@ class MmaComplexTensorOp< ) const { // Alias types for underlying real-valued matrix multiply operator - using MmaOperandA = typename Policy::Operator::FragmentA; - using MmaOperandB = typename Policy::Operator::FragmentB; - using MmaOperandC = typename Policy::Operator::FragmentC; + using MmaOperandA = typename ArchMmaOperator::FragmentA; + using MmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; static_assert(MmaOperandA::kElements == 1, "This implementation only supports math instructions in which exactly one element is needed for the A operand." 
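For context on the mma_multistage.h changes above: cp_async_zfill issues the same cp.async (LDGSTS) as cp_async but writes zeros into shared memory when the predicate is off, and the new fence/wait/syncthreads sequence drains any copies still in flight when the mainloop exits. A rough sketch of both idioms using the primitives from cutlass/arch/memory_sm80.h follows; the wrapper function names are illustrative only.

#include "cutlass/arch/memory_sm80.h"
#include "cutlass/cutlass.h"

// Copy one access worth of data from global to shared memory. When 'guard' is false,
// shared memory still receives zeros rather than being left unwritten, so residual
// tiles never consume stale data.
template <int SizeInBytes>
CUTLASS_DEVICE
void stage_fragment(void *smem_ptr, void const *gmem_ptr, bool guard) {
  cutlass::arch::cp_async_zfill<SizeInBytes, cutlass::arch::CacheOperation::Always>(
      smem_ptr, gmem_ptr, guard);
}

// Commit all issued cp.async operations, wait for every committed group to land, then
// synchronize the threadblock before shared memory is reused.
CUTLASS_DEVICE
void drain_outstanding_copies() {
  cutlass::arch::cp_async_fence();
  cutlass::arch::cp_async_wait<0>();
  __syncthreads();
}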
@@ -599,12 +605,18 @@ class MmaComplexTensorOp< /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) using Policy = Policy_; - + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + /// Shape of underlying instruction - using InstructionShape = typename Policy::Operator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; /// Underlying arch tag - using ArchTag = typename Policy::Operator::ArchTag; + using ArchTag = typename ArchMmaOperator::ArchTag; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; /// Complex transform on A operand static ComplexTransform const kTransformA = TransformA; @@ -612,9 +624,6 @@ class MmaComplexTensorOp< /// Complex transform on B operand static ComplexTransform const kTransformB = TransformB; - /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; - /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -626,7 +635,7 @@ class MmaComplexTensorOp< Operand::kA, ElementA, LayoutA, - MatrixShape, + MatrixShape, Policy::OpDelta::kRow, 32, 1 @@ -637,7 +646,7 @@ class MmaComplexTensorOp< /// Storage for transformed A tile using TransformedFragmentA = - Array; + Array; /// Iterates over the B operand in memory using IteratorB = MmaTensorOpMultiplicandTileIterator< @@ -645,7 +654,7 @@ class MmaComplexTensorOp< Operand::kB, ElementB, LayoutB, - MatrixShape, + MatrixShape, Policy::OpDelta::kColumn, 32, 1 @@ -656,17 +665,17 @@ class MmaComplexTensorOp< /// Storage for transformed B tile using TransformedFragmentB = - Array; + Array; static_assert( - !(Shape::kM % Policy::Operator::Shape::kM) && - !(Shape::kN % Policy::Operator::Shape::kN), + !(Shape::kM % ArchMmaOperator::Shape::kM) && + !(Shape::kN % ArchMmaOperator::Shape::kN), "Shape of warp-level Mma must be divisible by operator shape."); /// Number of complex products operations performed (one complex product needs four mma instructions) using MmaIterations = MatrixShape< - Shape::kM / Policy::Operator::Shape::kM, - Shape::kN / Policy::Operator::Shape::kN + Shape::kM / ArchMmaOperator::Shape::kM, + Shape::kN / ArchMmaOperator::Shape::kN >; /// Iterates over the C operand in memory @@ -674,7 +683,7 @@ class MmaComplexTensorOp< MatrixShape, ElementC, LayoutC, - typename Policy::Operator::Shape, + typename ArchMmaOperator::Shape, typename Policy::OpDelta>; /// Storage for C tile, the accumulator. 
Note, regardless of multiplicand type, this @@ -690,7 +699,7 @@ class MmaComplexTensorOp< // /// Underlying real-valued matrix multiply operator (concept: arch::Mma) - typename Policy::Operator mma; + ArchMmaOperator mma; public: @@ -712,11 +721,11 @@ class MmaComplexTensorOp< ) const { // Alias types for underlying real-valued matrix multiply operator - using InstMmaOperandA = typename Policy::Operator::FragmentA; - using InstMmaOperandB = typename Policy::Operator::FragmentB; - using MmaOperandC = typename Policy::Operator::FragmentC; + using InstMmaOperandA = typename ArchMmaOperator::FragmentA; + using InstMmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; - static_assert(platform::is_same, typename Policy::Operator::Shape>::value, + static_assert(platform::is_same, typename ArchMmaOperator::Shape>::value, "This implementation only supports MMA.1688 math instructions."); static_assert(InstMmaOperandA::kElements == 4, @@ -794,8 +803,8 @@ class MmaComplexTensorOp< void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, FragmentA const &A, FragmentB const &B) const { // Alias types for underlying real-valued matrix multiply operator - using InstMmaOperandA = typename Policy::Operator::FragmentA; - using InstMmaOperandB = typename Policy::Operator::FragmentB; + using InstMmaOperandA = typename ArchMmaOperator::FragmentA; + using InstMmaOperandB = typename ArchMmaOperator::FragmentB; // // Define conversions from source type to instruction operands' type diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h index bf3d98dfbe..4ab139023a 100644 --- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h @@ -147,11 +147,17 @@ class MmaGaussianComplexTensorOp< /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) using Policy = Policy_; + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Policy::Operator; + /// Shape of underlying instruction - using InstructionShape = typename Policy::Operator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + /// Underlying arch tag + using ArchTag = typename ArchMmaOperator::ArchTag; - /// Underlying architecture tag - using ArchTag = typename Policy::Operator::ArchTag; + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; /// Complex transform on A operand static ComplexTransform const kTransformA = TransformA; @@ -159,8 +165,6 @@ class MmaGaussianComplexTensorOp< /// Complex transform on B operand static ComplexTransform const kTransformB = TransformB; - /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -173,7 +177,7 @@ class MmaGaussianComplexTensorOp< Operand::kA, ElementA, LayoutA, - MatrixShape, + MatrixShape, Policy::OpDelta::kRow, 32, 1 @@ -191,7 +195,7 @@ class MmaGaussianComplexTensorOp< Operand::kB, ElementB, LayoutB, - MatrixShape, + MatrixShape, Policy::OpDelta::kColumn, 32, 1 @@ -204,14 +208,14 @@ class MmaGaussianComplexTensorOp< using TransformedFragmentB = FragmentB; static_assert( - !(Shape::kM % Policy::Operator::Shape::kM) && - !(Shape::kN % Policy::Operator::Shape::kN), + !(Shape::kM % ArchMmaOperator::Shape::kM) && + !(Shape::kN % ArchMmaOperator::Shape::kN), 
"Shape of warp-level Mma must be divisible by operator shape."); /// Number of mma operations performed using MmaIterations = MatrixShape< - Shape::kM / Policy::Operator::Shape::kM, - Shape::kN / Policy::Operator::Shape::kN + Shape::kM / ArchMmaOperator::Shape::kM, + Shape::kN / ArchMmaOperator::Shape::kN >; /// Iterates over the C operand in memory @@ -219,7 +223,7 @@ class MmaGaussianComplexTensorOp< MatrixShape, ElementC, LayoutC, - typename Policy::Operator::Shape, + typename ArchMmaOperator::Shape, typename Policy::OpDelta>; /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this @@ -229,7 +233,7 @@ class MmaGaussianComplexTensorOp< using FragmentC = typename IteratorC::Fragment; static_assert( - FragmentC::kElements == 3 * MmaIterations::kCount * Policy::Operator::FragmentC::kElements, + FragmentC::kElements == 3 * MmaIterations::kCount * ArchMmaOperator::FragmentC::kElements, "Unexpected gaussian complex fragment length."); private: @@ -239,7 +243,7 @@ class MmaGaussianComplexTensorOp< // /// Underlying real-valued matrix multiply operator (concept: arch::Mma) - typename Policy::Operator mma; + ArchMmaOperator mma; public: @@ -261,9 +265,9 @@ class MmaGaussianComplexTensorOp< ) const { // Alias types for underlying real-valued matrix multiply operator - using MmaOperandA = typename Policy::Operator::FragmentA; - using MmaOperandB = typename Policy::Operator::FragmentB; - using MmaOperandC = typename Policy::Operator::FragmentC; + using MmaOperandA = typename ArchMmaOperator::FragmentA; + using MmaOperandB = typename ArchMmaOperator::FragmentB; + using MmaOperandC = typename ArchMmaOperator::FragmentC; static_assert(MmaOperandA::kElements == 1, "This implementation only supports math instructions in which exactly one element is needed for the A operand." 
@@ -346,8 +350,6 @@ class MmaGaussianComplexTensorOp< ///////////////////////////////////////////////////////////////////////////////////////////////// -// TODO - partial specializations of real*complex and complex*real - ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace warp diff --git a/include/cutlass/gemm/warp/mma_simt.h b/include/cutlass/gemm/warp/mma_simt.h index c90624cee7..306a08d17c 100644 --- a/include/cutlass/gemm/warp/mma_simt.h +++ b/include/cutlass/gemm/warp/mma_simt.h @@ -68,6 +68,10 @@ template < typename Policy_, /// Number of partitions along K dimension int PartitionsK = 1, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, /// Used for partial specialization typename Enable = bool > @@ -104,10 +108,10 @@ class MmaSimt { using ArchTag = arch::Sm50; /// Complex transform on A operand - static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformA = TransformA; /// Complex transform on B operand - static ComplexTransform const kTransformB = ComplexTransform::kNone; + static ComplexTransform const kTransformB = TransformB; /// Layout of threads using ThreadLayoutA = typename platform::conditional< platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA >::value, @@ -215,12 +219,22 @@ class MmaSimt { CUTLASS_DEVICE void operator()( FragmentC &d, - FragmentA const &a, - FragmentB const &b, + FragmentA a, + FragmentB b, FragmentC const &c, int group_idx = 0) const { ThreadMma mma; + if (kTransformA == ComplexTransform::kConjugate) { + conjugate conj_a; + a = conj_a(a); + } + + if (kTransformB == ComplexTransform::kConjugate) { + conjugate conj_b; + b = conj_b(b); + } + mma(d, a, b, c); } diff --git a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h index 8b7312baa0..ba86e08583 100644 --- a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h @@ -111,17 +111,28 @@ class SparseMmaTensorOp { /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) using Policy = Policy_; + /// Equivalant base dense mma + using Base = MmaTensorOp; + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename Base::ArchMmaOperator; + /// Architecture tag from underlying instruction - using ArchTag = typename Policy::Operator::ArchTag; + using ArchTag = typename Base::ArchTag; /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; + using OperatorClass = typename Base::OperatorClass; + + /// Shape of underlying instruction + using InstructionShape = typename Base::InstructionShape; /// Complex transform on A operand - static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformA = Base::kTransformA; /// Complex transform on B operand - static ComplexTransform const kTransformB = ComplexTransform::kNone; + static ComplexTransform const kTransformB = Base::kTransformB; /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -171,25 +182,19 @@ class SparseMmaTensorOp { Array; /// Iterates over the B operand in memory - using IteratorB = MmaTensorOpMultiplicandTileIterator< - MatrixShape, Operand::kB, ElementB, LayoutB, - MatrixShape, - Policy::OpDelta::kRow, 
kThreadCount, kPartitionsK>; + using IteratorB = typename Base::IteratorB; /// Storage for B tile - using FragmentB = typename IteratorB::Fragment; + using FragmentB = typename Base::FragmentB; /// Storage for transformed B tile - using TransformedFragmentB = - Array; + using TransformedFragmentB = typename Base::TransformedFragmentB; /// Iterates over the C operand in memory - using IteratorC = MmaTensorOpAccumulatorTileIterator< - MatrixShape, ElementC, LayoutC, - typename Policy::Operator::Shape, typename Policy::OpDelta>; + using IteratorC = typename Base::IteratorC; /// Storage for C tile - using FragmentC = typename IteratorC::Fragment; + using FragmentC = typename Base::FragmentC; /// Iterates over the E operand in memory using IteratorE = SparseMmaTensorOpMetaTileIterator< @@ -204,23 +209,13 @@ class SparseMmaTensorOp { /// Storage for E tile using FragmentE = typename IteratorE::Fragment; -private: - - static_assert( - !(Shape::kM % Policy::Operator::Shape::kM) && - !(Shape::kN % Policy::Operator::Shape::kN), - "Shape of warp-level Mma must be divisible by operator shape."); - - /// Number of mma operations performed - using MmaIterations = MatrixShape< - Shape::kM / Policy::Operator::Shape::kM, - Shape::kN / Policy::Operator::Shape::kN - >; + /// Number of mma operations performed + using MmaIterations = typename Base::MmaIterations; public: /// Underlying matrix multiply operator (concept: arch::Mma) - typename Policy::Operator mma; + ArchMmaOperator mma; public: @@ -299,21 +294,21 @@ class SparseMmaTensorOp { // Define conversions from source type to instruction type // FloatRoundStyle const kRoundA = - PreferredRoundingMode::kRound; FloatRoundStyle const kRoundB = - PreferredRoundingMode::kRound; - detail::ConvertAndPack convert_A; - NumericArrayConverter convert_B; Array const *ptr_A = reinterpret_cast const *>(&A); - Array * - ptr_dst_A = reinterpret_cast * + ptr_dst_A = reinterpret_cast *>(&dst_A); dst_B = convert_B(B); diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h index 1a10c7e4fe..a60a86020a 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_tensor_op.h @@ -244,8 +244,6 @@ class MmaTensorOp { /// Storage for C tile using FragmentC = typename IteratorC::Fragment; -private: - /// Number of mma operations performed using MmaIterations = MatrixShape< (Shape::kM + ArchMmaOperator::Shape::kM - 1) / ArchMmaOperator::Shape::kM, diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h index 1fe04e92af..59f68a42a1 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h @@ -1518,6 +1518,7 @@ class MmaTensorOpMultiplicandTileIterator< } else if (Layout::kFactor == 2) { // Super Matrix multiply kBlock = 32 if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) { + // Matrix multiply 1688 A/B // (Q stands for 1 8x128bit block). 
// Q0 // Q1 @@ -3191,10 +3192,430 @@ class MmaTensorOpAccumulatorTileIterator< int idx = mma_m + mma_n * Policy::MmaIterations::kRow; - AccessType* access_ptr = reinterpret_cast(offset_ref.data() + - offset_ref.offset(TensorCoord(accum_m, accum_n))); + AccessType* access_ptr = reinterpret_cast(offset_ref.data() + + offset_ref.offset(TensorCoord(accum_m, accum_n))); + + access_ptr[0] = frag_ptr[idx]; + } + } + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_byte_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index byte_offset) const { ///< store a tile with a linear offset + + store_with_pointer_offset(byte_offset / sizeof(Element)); + } + + /// Stores a fragment to memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void store( + Fragment &frag, ///< fragment to store to the tensor + TensorCoord const &tile_offset) const { ///< stores a tile with a logical offset in units of whole tiles + + store(frag, tile_offset, 0); + } + + /// Stores a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void store( + /// fragment to store to the tensor + Fragment const &frag, + /// stores a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// stores a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for 32-thread TensorOps. It is used to load or store +/// accumulators from memory and is agnostic to layout. It could be faster if it assumed row-major +/// accumulator layout. 
+/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept | +/// WriteableRandomAccessContiguousTileIteratorConcept +/// + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Element typ + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_, + /// Interleaved N + int InterleavedN> +class MmaTensorOpAccumulatorTileIterator< + Shape_, Element_, cutlass::layout::TensorNCxHWx, + InstructionShape_, OpDelta_> { + public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand::kC; + + /// Element type + using Element = int8_t; + + /// Layout of source tile + using Layout = cutlass::layout::TensorNCxHWx; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + using OpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + /// Number of elements in strided dimension that each STG writes + static int const kStridedPerSTG = 8; + + /// Factor to calculate reorder index to pack accumulator. 
+ static int const kPackedFactor = Shape::kColumn / 32; + + /// Number of mma operations performed + using MmaIterations = MatrixShape; + }; + +private: + + static int const kElementsPerAccess = InterleavedN / 4; + +public: + + // + // Derived quantities + // + + struct alignas((kElementsPerAccess * sizeof_bits::value / 8)) AccessType { + Array storage; + }; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + +private: + + /// Reference to output tensor + TensorRef ref_; + + /// Row offset index globally + LongIndex global_offset_row_; + + /// Column offset index globally + LongIndex global_offset_col_; + + /// Output tensor size + TensorCoord extent_; + + /// Alpha + float alpha_; + + /// Beta + float beta_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator( + TensorRef const &ref, + int const lane_id, + TensorCoord extent, + float alpha = 1.0f, + float beta = 0.0f + ): + ref_(ref), + extent_(extent), + alpha_(alpha), + beta_(beta) { + + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + + global_offset_row_ = quad; + + global_offset_col_ = lane_in_quad * kElementsPerAccess; + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator &add_tile_offset(MatrixCoord const &tile_offset) { + + global_offset_row_ += tile_offset.row() * Shape::kRow; + + global_offset_col_ += tile_offset.column() * Shape::kColumn; + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator & operator++() { + // deliberate no-op + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpAccumulatorTileIterator & operator--() { + // deliberate no-op + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + Fragment &frag, ///< fragment to load from the tensor + Index pointer_offset) const { ///< loads a tile with a linear offset + + TensorRef offset_ref(ref_); + offset_ref.add_pointer_offset(pointer_offset); + + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kN; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kM; ++mma_m) { + int accum_m = mma_m * InstructionShape::kM; + int accum_n = mma_n * InstructionShape::kN; + + int idx = mma_m + mma_n * Policy::MmaIterations::kM; + + AccessType* access_ptr = reinterpret_cast(offset_ref.data() + + accum_m * offset_ref.stride(0) + accum_n); + + frag_ptr[idx] = access_ptr[0]; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + Fragment &frag, ///< fragment to load from the tensor + Index byte_offset) const { ///< loads a tile with a linear offset + + load_with_pointer_offset(byte_offset / sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + Fragment &frag, ///< fragment to load from the tensor + TensorCoord const &tile_offset) const { ///< loads a tile with a logical offset in units of whole tiles + + load(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + Fragment &frag, ///< fragment to load from the tensor + TensorCoord const &tile_offset, ///< loads a tile with a logical offset in units of whole tiles + Index pointer_offset) const { ///< loads a tile with a logical offset AND a pointer offset + + load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } - access_ptr[0] = frag_ptr[idx]; + /// Stores a fragment to memory + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) const { + store_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_pointer_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index pointer_offset) const { ///< store a tile with a linear offset + + TensorRef offset_ref(ref_); + offset_ref.add_pointer_offset(pointer_offset); + + Array output_frag_f; + Array output_frag; + + LongIndex pq = extent_.h() * extent_.w(); + + LongIndex extent_row = extent_.n() * pq; + LongIndex extent_col = extent_.c(); + + LongIndex k_major = (global_offset_col_ / InterleavedN) * pq; + Index k_minor = global_offset_col_ % InterleavedN; + LongIndex k_offset = k_major * InterleavedN + k_minor; + LongIndex k_offset_delta = pq * InterleavedN; + + LongIndex stride_n = pq * extent_.c(); + + Index n; + LongIndex pq_rem; + + unsigned int pq_mul, pq_shr; + find_divisor(pq_mul, pq_shr, pq); + + if(beta_ == 0.0f) { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + output_frag_f[i] = frag[i]; + } + + if(InstructionShape::kM == Policy::kStridedPerSTG) { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + output_frag[i] = (Element)(output_frag_f[i] * alpha_); + } + } else { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + int map_i = (i / (16 * Policy::kPackedFactor)) * (16 * Policy::kPackedFactor) + + (i % (8 * 
Policy::kPackedFactor)) / 2 * 4 + + (i % (8 * Policy::kPackedFactor)) % 2 + + (i / (8 * Policy::kPackedFactor)) % 2 * 2; + output_frag[i] = (Element)(output_frag_f[map_i] * alpha_); + } + } + + AccessType const *frag_ptr = reinterpret_cast(&output_frag); + + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + int accum_m = mma_m * Policy::kStridedPerSTG; + + fast_divmod(n, pq_rem, global_offset_row_ + accum_m, pq, pq_mul, pq_shr); + LongIndex offset_m = n * stride_n + k_offset + pq_rem * InterleavedN; + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + + int accum_n = mma_n * InterleavedN; + + int idx = mma_n + mma_m * Policy::MmaIterations::kColumn; + + if((global_offset_row_ + accum_m < extent_row) && (global_offset_col_ + accum_n < extent_col)) { + AccessType* access_ptr = reinterpret_cast(offset_ref.data() + + offset_m + mma_n * k_offset_delta); + + access_ptr[0] = frag_ptr[idx]; + } + } + } + } else { + if(InstructionShape::kM == Policy::kStridedPerSTG) { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + output_frag_f[i] = frag[i]; + } + } else { + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < frag.size(); ++i) { + int map_i = (i / (16 * Policy::kPackedFactor)) * (16 * Policy::kPackedFactor) + + (i % (8 * Policy::kPackedFactor)) / 2 * 4 + + (i % (8 * Policy::kPackedFactor)) % 2 + + (i / (8 * Policy::kPackedFactor)) % 2 * 2; + output_frag_f[i] = frag[map_i]; + } + } + + AccessType const *frag_ptr = reinterpret_cast(&output_frag); + + Array ref_frag; + AccessType *ref_frag_ptr = reinterpret_cast(&ref_frag); + + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + int accum_m = mma_m * Policy::kStridedPerSTG; + + fast_divmod(n, pq_rem, global_offset_row_ + accum_m, pq, pq_mul, pq_shr); + LongIndex offset_m = n * stride_n + k_offset + pq_rem * InterleavedN; + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + + int accum_n = mma_n * InterleavedN; + + int idx = mma_n + mma_m * Policy::MmaIterations::kColumn; + + if((global_offset_row_ + accum_m < extent_row) && (global_offset_col_ + accum_n < extent_col)) { + AccessType* access_ptr = reinterpret_cast(offset_ref.data() + + offset_m + mma_n * k_offset_delta); + + ref_frag_ptr[0] = access_ptr[0]; + + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < kElementsPerAccess; ++i) { + output_frag[idx * kElementsPerAccess + i] = Element(alpha_ * output_frag_f[idx * kElementsPerAccess + i] + + beta_ * ref_frag[i]); + } + + access_ptr[0] = frag_ptr[idx]; + } + } } } } diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h index ed6384f05a..c57cc6a8d9 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h @@ -2243,6 +2243,847 @@ class MmaVoltaTensorOpMultiplicandTileIterator< } }; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for 'TN' arrangement +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Operand identity + Operand Operand_, + /// Data type of A elements + typename Element_, + /// Layout of matrix operand + typename Layout_, + /// Shape of one matrix production operation (concept: MatrixShape) + typename InstructionShape_, + /// Delta between *MMA operations (in units of 
*MMA operations, concept: + /// MatrixShape) + int OpDelta_, + /// Number of threads participating in one matrix operation + int Threads = 32, + /// Number of partitions along K dimension + int PartitionsK_ = 1> +class MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner { + public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + /// Basic check + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = Layout_; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Number of elements accessed per Shared Memory load + static int const kElementsPerAccess = 4; + +private: + + static int const kInterleavedTileRows = 32; + static int const kInterleavedTileColumns = 32; + static int const kInstructionsPerTile = 2; + + /// Rounded up instruction counts + using TileCount = MatrixShape< + Shape::kRow / kInterleavedTileRows, + Shape::kColumn / kInterleavedTileColumns + >; + + using FragmentCount = MatrixShape< + TileCount::kRow * kInstructionsPerTile, + TileCount::kColumn * kInstructionsPerTile + >; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = Array< + Element, + (kOperand == Operand::kA ? 
FragmentCount::kRow : FragmentCount::kColumn) * kElementsPerAccess + >; + + /// Memory access type + using AccessType = AlignedArray; + +private: + + /// Underlying tensor reference + TensorRef ref_; + + /// Extent of tensor + MatrixCoord extent_; + + /// Origin + MatrixCoord origin_; + + /// Used to conditionally enable extents checking + bool divisible_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner(): divisible_(true) { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner( + TensorRef const &ref, + int lane_id + ): + ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) { + + int quad_id = lane_id / 4; + int lane_in_quad = (lane_id % 4); + + if (kOperand == Operand::kA) { + + int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile + lane_in_quad; + int col_idx = 0; + + origin_ = MatrixCoord(row_idx, col_idx); + } + else { + + int row_idx = 0; + int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile + lane_in_quad; + + origin_ = MatrixCoord(row_idx, col_idx); + } + + ref_.add_coord_offset(origin_); + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner( + TensorRef const &ref, + TensorCoord extent, + int lane_id + ): ref_(ref), extent_(extent), divisible_(false) { + + int quad_id = lane_id / 4; + int lane_in_quad = (lane_id % 4); + + if (kOperand == Operand::kA) { + + int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile + lane_in_quad; + int col_idx = 0; + + origin_ = MatrixCoord(row_idx, col_idx); + } + else { + + int row_idx = 0; + int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile + lane_in_quad; + + origin_ = MatrixCoord(row_idx, col_idx); + } + + #if defined(__CUDA_ARCH__) + __syncthreads(); + #endif + + ref_.add_coord_offset(origin_); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner &add_pointer_offset(LongIndex offset) { + + ref_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner &add_tile_offset(TensorCoord const &tile_offset) { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + origin_ += coord_offset; + + ref_.add_coord_offset(coord_offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator++() { + + if (kOperand == Operand::kA) { + add_tile_offset({0, 1}); + } + else { + add_tile_offset({1, 0}); + } + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator--() { + + if (kOperand == Operand::kA) { + add_tile_offset({0, -1}); + } + else { + add_tile_offset({-1, 0}); + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + 
CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_pointer_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + AccessType *frag_ptr = reinterpret_cast(&frag); + AccessType const *access_ptr = reinterpret_cast(ref_.data()); + int ldm = ref_.stride()[0]; + + if (kOperand == Operand::kA) { + + CUTLASS_PRAGMA_UNROLL + for (int idx = 0; idx < FragmentCount::kRow; ++idx) { + + int tile_idx = idx / 2; + int quad_idx = idx % 2; + + int row_offset = tile_idx * kInterleavedTileRows + quad_idx * 4; + frag_ptr[idx] = access_ptr[row_offset * ldm / kElementsPerAccess]; + } + } + else { + CUTLASS_PRAGMA_UNROLL + for (int idx = 0; idx < FragmentCount::kColumn; ++idx) { + + int tile_idx = idx / 2; + int quad_idx = idx % 2; + + int col_offset = tile_idx * kInterleavedTileColumns + quad_idx * 4; + frag_ptr[idx] = access_ptr[col_offset * ldm / kElementsPerAccess]; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + + load_with_pointer_offset(frag, byte_offset * 8 / sizeof_bits::value); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset) + pointer_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset) + byte_offset * 8 / sizeof_bits::value); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. 
+ /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation + } +}; + + +/// Tile iterator specialized for 'NT' arrangement +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Operand identity + Operand Operand_, + /// Data type of A elements + typename Element_, + /// Layout of matrix operand + typename Layout_, + /// Shape of one matrix production operation (concept: MatrixShape) + typename InstructionShape_, + /// Delta between *MMA operations (in units of *MMA operations, concept: + /// MatrixShape) + int OpDelta_, + /// Number of threads participating in one matrix operation + int Threads = 32, + /// Number of partitions along K dimension + int PartitionsK_ = 1> +class MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter { + public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + /// Basic check + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaVoltaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = Layout_; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Number of elements accessed per Shared Memory load + static int const kElementsPerAccess = 4; + +private: + + static int const kInterleavedTileRows = 32; + static int const kInterleavedTileColumns = 32; + static int const kInstructionsPerTile = 2; + + /// Rounded up instruction counts + using TileCount = MatrixShape< + Shape::kRow / kInterleavedTileRows, + Shape::kColumn / kInterleavedTileColumns + >; + + using FragmentCount = MatrixShape< + TileCount::kRow * kInstructionsPerTile, + TileCount::kColumn * kInstructionsPerTile + >; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = Array< + Element, + (kOperand == Operand::kA ? 
FragmentCount::kRow : FragmentCount::kColumn) * kElementsPerAccess + >; + + /// Memory access type + using AccessType = AlignedArray; + +private: + + /// Underlying tensor reference + TensorRef ref_; + + /// Extent of tensor + MatrixCoord extent_; + + /// Origin + MatrixCoord origin_; + + /// Used to conditionally enable extents checking + bool divisible_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter(): divisible_(true) { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter( + TensorRef const &ref, + int lane_id + ): + ref_(ref), extent_(Shape::kRow, Shape::kColumn), divisible_(true) { + + int quad_id = lane_id / 4; + int lane_in_quad = (lane_id % 4); + + if (kOperand == Operand::kA) { + + int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile; + int col_idx = lane_in_quad; + + origin_ = MatrixCoord(row_idx, col_idx); + } + else { + + int row_idx = lane_in_quad; + int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile; + + origin_ = MatrixCoord(row_idx, col_idx); + } + + ref_.add_coord_offset(origin_); + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter( + TensorRef const &ref, + TensorCoord extent, + int lane_id + ): ref_(ref), extent_(extent), divisible_(false) { + + int quad_id = lane_id / 4; + int lane_in_quad = (lane_id % 4); + + if (kOperand == Operand::kA) { + + int row_idx = ((quad_id & 1) + ((quad_id & 4) / 2)) * 4 * kInstructionsPerTile; + int col_idx = lane_in_quad; + + origin_ = MatrixCoord(row_idx, col_idx); + } + else { + + int row_idx = lane_in_quad; + int col_idx = (quad_id / 2) * 4 * kInstructionsPerTile; + + origin_ = MatrixCoord(row_idx, col_idx); + } + + #if defined(__CUDA_ARCH__) + __syncthreads(); + #endif + + ref_.add_coord_offset(origin_); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter &add_pointer_offset(LongIndex offset) { + + ref_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter &add_tile_offset(TensorCoord const &tile_offset) { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + origin_ += coord_offset; + + ref_.add_coord_offset(coord_offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator++() { + + if (kOperand == Operand::kA) { + add_tile_offset({0, 1}); + } + else { + add_tile_offset({1, 0}); + } + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator--() { + + if (kOperand == Operand::kA) { + add_tile_offset({0, -1}); + } + else { + add_tile_offset({-1, 0}); + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + 
MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_pointer_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + AccessType *frag_ptr = reinterpret_cast(&frag); + AccessType const *access_ptr = reinterpret_cast(ref_.data()); + int ldm = ref_.stride()[0]; + + if (kOperand == Operand::kA) { + + CUTLASS_PRAGMA_UNROLL + for (int idx = 0; idx < FragmentCount::kRow; ++idx) { + + int tile_idx = idx / 2; + int quad_idx = idx % 2; + + int row_offset = tile_idx * kInterleavedTileRows; + frag_ptr[idx] = access_ptr[row_offset / kElementsPerAccess + quad_idx]; + } + } + else { + CUTLASS_PRAGMA_UNROLL + for (int idx = 0; idx < FragmentCount::kColumn; ++idx) { + + int tile_idx = idx / 2; + int quad_idx = idx % 2; + + int col_offset = tile_idx * kInterleavedTileColumns; + frag_ptr[idx] = access_ptr[col_offset / kElementsPerAccess + quad_idx]; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + + load_with_pointer_offset(frag, byte_offset * 8 / sizeof_bits::value); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset) + pointer_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + + TensorCoord coord_offset(tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + + load_with_pointer_offset(frag, ref_.offset(coord_offset) + byte_offset * 8 / sizeof_bits::value); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. 
+ /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_> +class MmaVoltaTensorOpMultiplicandTileIterator< + Shape_, + Operand::kA, + Element_, + cutlass::layout::RowMajor, + InstructionShape_, + OpDelta_, + 32 +> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner< + Shape_, Operand::kA, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> { + +public: + using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner< + Shape_, Operand::kA, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> ; + + using TensorRef = typename Base::TensorRef; + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): Base(ref, lane_id) { } + +}; + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_> +class MmaVoltaTensorOpMultiplicandTileIterator< + Shape_, + Operand::kA, + Element_, + cutlass::layout::ColumnMajor, + InstructionShape_, + OpDelta_, + 32 +> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter< + Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> { + +public: + using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter< + Shape_, Operand::kA, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> ; + + using TensorRef = typename Base::TensorRef; + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): Base(ref, lane_id) { } + +}; + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_> +class MmaVoltaTensorOpMultiplicandTileIterator< + Shape_, Operand::kB, Element_, + cutlass::layout::ColumnMajor, + InstructionShape_, OpDelta_, 32 +> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner< + Shape_, Operand::kB, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_> { + +public: + using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalInner< + Shape_, Operand::kB, Element_, cutlass::layout::ColumnMajor, InstructionShape_, OpDelta_>; + + using TensorRef = typename Base::TensorRef; + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): Base(ref, lane_id) { } +}; + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of elements + typename Element_, + /// Shape of one matrix 
product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_> +class MmaVoltaTensorOpMultiplicandTileIterator< + Shape_, Operand::kB, Element_, + cutlass::layout::RowMajor, + InstructionShape_, OpDelta_, 32 +> : public MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter< + Shape_, Operand::kB, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> { + +public: + using Base = MmaVoltaTensorOpMultiplicandTileIteratorCanonicalOuter< + Shape_, Operand::kB, Element_, cutlass::layout::RowMajor, InstructionShape_, OpDelta_>; + + using TensorRef = typename Base::TensorRef; + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaVoltaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): Base(ref, lane_id) { } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace warp } // namespace gemm } // namespace cutlass diff --git a/include/cutlass/layout/tensor.h b/include/cutlass/layout/tensor.h index f3d5a12bf8..7f608dcf76 100644 --- a/include/cutlass/layout/tensor.h +++ b/include/cutlass/layout/tensor.h @@ -40,6 +40,7 @@ #endif #include "cutlass/cutlass.h" #include "cutlass/fast_math.h" +#include "cutlass/layout/pitch_linear.h" #include "cutlass/layout/matrix.h" #include "cutlass/coord.h" #include "cutlass/tensor_coord.h" @@ -120,6 +121,12 @@ class TensorNHWC { LongIndex(stride_[1] * coord.h()) + LongIndex(stride_[2] * coord.n()); } + + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord coord) const { + return coord.contiguous() + LongIndex(coord.strided() * stride_[2]); + } /// Returns the logical coordinate (n, h, w, c) from a given offset in linear memory. CUTLASS_HOST_DEVICE @@ -182,7 +189,6 @@ class TensorNHWC { } }; - ///////////////////////////////////////////////////////////////////////////////////////////////// /// Mapping function for 4-D NCHW tensors. @@ -424,6 +430,14 @@ class TensorCxRSKx { LongIndex(stride_[2] * c_major); } + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord const &coord) const { + return (coord.contiguous() % kInterleave) + + LongIndex((coord.contiguous() / kInterleave) * stride_[2]) + + LongIndex(coord.strided() * kInterleave); + } + /// Returns the stride of the layout CUTLASS_HOST_DEVICE Stride stride() const { diff --git a/include/cutlass/transform/pitch_linear_thread_map.h b/include/cutlass/transform/pitch_linear_thread_map.h index de21ede4ea..c19f79cbbc 100644 --- a/include/cutlass/transform/pitch_linear_thread_map.h +++ b/include/cutlass/transform/pitch_linear_thread_map.h @@ -340,6 +340,134 @@ struct PitchLinearWarpRakedThreadMap { //////////////////////////////////////////////////////////////////////////////// +/// Policy defining a warp-raked arrangement in which a shape is partitioned into contiguous +/// elements. Warps are arranged based on a stride. +/// +/// This ThreadMap is used by tensor core kernels for NCxHWx layout. 
+template < + typename Shape_, + int Threads, + typename WarpThreadArrangement_, + int ElementsPerAccess = 1 +> +struct PitchLinearStridedWarpRakedThreadMap { + + /// Tensor coordinate + using TensorCoord = layout::PitchLinearCoord; + + /// Tile shape + using Shape = Shape_; + + /// Number of threads total + static int const kThreads = Threads; + + using WarpThreadArrangement = WarpThreadArrangement_; + + /// Extract vector length from Layout + static int const kElementsPerAccess = ElementsPerAccess; + + /// Base ThreadMap + using BaseThreadMap = PitchLinearWarpRakedThreadMap< + Shape, + kThreads, + WarpThreadArrangement, + kElementsPerAccess + >; + + /// Shape of access by each thread + using ThreadAccessShape = typename BaseThreadMap::ThreadAccessShape; + + + struct Detail { + + using WarpThreadArrangement = WarpThreadArrangement_; + + using WarpAccessIterations = typename BaseThreadMap::Detail::WarpAccessIterations; + + static int const kWarpSize = BaseThreadMap::Detail::kWarpSize; + + static int const kWarpCount = BaseThreadMap::Detail::kWarpCount; + + using ShapeInAccesses = typename BaseThreadMap::Detail::ShapeInAccesses; + + // Divide it into the number of warps, first partitioning the contiguous dimension then the + // stride. + static int const kWarpsContiguous = + (WarpAccessIterations::kContiguous >= kWarpCount + ? kWarpCount + : WarpAccessIterations::kContiguous); + + static int const kWarpsStrided = + (kWarpCount > WarpAccessIterations::kContiguous + ? kWarpCount / kWarpsContiguous + : 1); + + /// Arrangement of warps within a threadblock-scoped tile + using WarpArrangement = layout::PitchLinearShape< + kWarpsContiguous, kWarpsStrided + >; + + }; + + ///< Iterations along each dimension (concept: PitchLinearShape) + using Iterations = layout::PitchLinearShape< + Detail::WarpAccessIterations::kContiguous / Detail::kWarpsContiguous, + Detail::WarpAccessIterations::kStrided / Detail::kWarpsStrided + >; + + static_assert(Iterations::kCount, + "Number of iterations must be non-zero"); + + ///< Delta betweeen accesses (units of elements, concept: PitchLinearShape) + using Delta = typename BaseThreadMap::Delta; + + /// Maps thread ID to a coordinate offset within the tensor's logical coordinate space + CUTLASS_HOST_DEVICE + static TensorCoord initial_offset(int thread_id) { + + int warp_id = (thread_id / Detail::kWarpSize); + int lane_id = (thread_id % Detail::kWarpSize); + + // + // compute warp-level offset + // + + // This is the shape of the entire area covered by a warp's memory access (in units of vectors) + layout::PitchLinearCoord warp_footprint{ + Detail::WarpThreadArrangement::kContiguous * Iterations::kContiguous, + Detail::WarpThreadArrangement::kStrided * Iterations::kStrided + }; + + // This is the offset of a specific warp (in units of vectors) + layout::PitchLinearCoord warp_offset{ + (warp_id % Detail::kWarpsContiguous), + (warp_id / Detail::kWarpsContiguous) + }; + + // This is the offset of a specific thread within a warp (units of vectors) + layout::PitchLinearCoord thread_offset_in_warp{ + lane_id % Detail::WarpThreadArrangement::kContiguous, + lane_id / Detail::WarpThreadArrangement::kContiguous + }; + + // This is the offset of a thread within a threadblock tile (units of vectors) + layout::PitchLinearCoord thread_offset_in_threadblock_tile_vec = + warp_footprint * warp_offset + thread_offset_in_warp; + + // This is the offset of a thread within a threadblock tile (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile_base{ + 
thread_offset_in_threadblock_tile_vec.contiguous() * kElementsPerAccess, + thread_offset_in_threadblock_tile_vec.strided() + }; + + return thread_offset_in_threadblock_tile_base; + } + + +}; + +//////////////////////////////////////////////////////////////////////////////// + /// Transpose the existing ThreadMap. For example, interleaved layout is like /// congruous in the global memory and crosswise in the shared memory. We need /// to transpose the coordinates between two. diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h index 7e34b546be..7dce3228ec 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h @@ -500,7 +500,7 @@ class PredicatedTileAccessIterator -class RegularTileAccessIterator< - Shape_, Element_, - layout::TensorOpMultiplicandRowMajorInterleaved::value, - InterleavedK>, - AdvanceRank, ThreadMap_, Alignment> { - public: - static_assert( - AdvanceRank == 0 || AdvanceRank == 1, - "Specialization for pitch-linear iterator may along advance along the " - "contiguous(rank=0) or strided(rank=1) dimension."); - - using Shape = Shape_; - using Element = Element_; - using Layout = - layout::TensorOpMultiplicandRowMajorInterleaved::value, - InterleavedK>; - static int const kAdvanceRank = AdvanceRank; - static int const kAlignment = Alignment; - - using Index = typename Layout::Index; - using LongIndex = typename Layout::LongIndex; - - using TensorRef = TensorRef; - using TensorCoord = typename Layout::TensorCoord; - - using ThreadMap = ThreadMap_; - - /// Internal details made public to facilitate introspection - struct Detail { - /// This iterator is specialized for an access size that is 128 bits in - /// length. 
- static int const kAccessSizeInBits = 128; - - static_assert(sizeof_bits::value * ThreadMap::kElementsPerAccess == - kAccessSizeInBits, - "This iterator requires a policy whose access size is 128bs"); - }; - - private: - - /// Element type per access - using AccessType = Array; - - private: - // - // Data members - // - - /// Internal pointer to first access of tile - AccessType *pointer_; - - /// Internal byte offset - Index byte_offset_; - - /// Iteration in the contiguous dimension - int iteration_contiguous_; - - /// Iteration in the strided dimension - int iteration_strided_; - - public: - /// Construct a TileIterator with zero threadblock offset - CUTLASS_HOST_DEVICE - RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor - int thread_id ///< ID of each participating thread - ) - : byte_offset_(0) { - layout::PitchLinearCoord thread_offset_base = - ThreadMap::initial_offset(thread_id); - - // initialize pointer - pointer_ = reinterpret_cast( - ref.data() + ref.offset(thread_offset_base)); - - set_iteration_index(0); - } - - /// Overrides the internal iteration index - CUTLASS_HOST_DEVICE - void set_iteration_index(int index) { - iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; - iteration_strided_ = index / ThreadMap::Iterations::kContiguous; - } - - /// Adds a pointer offset in units of Element - CUTLASS_HOST_DEVICE - void add_pointer_offset(LongIndex pointer_offset) { - byte_offset_ += pointer_offset * sizeof(Element); - } - - /// Returns a pointer - CUTLASS_HOST_DEVICE - AccessType *get() const { - AccessType *access_ptr = pointer_; - - int access_offset = - (iteration_strided_ * ThreadMap::Delta::kStrided * Layout::kInterleavedK + - iteration_contiguous_ * ThreadMap::Delta::kContiguous) / ThreadMap::kElementsPerAccess; - - char *access_byte_ptr = - reinterpret_cast(access_ptr + access_offset); - - return reinterpret_cast(access_byte_ptr + byte_offset_); - } - - /// Advances to the next tile in memory. - CUTLASS_HOST_DEVICE - RegularTileAccessIterator &operator++() { - ++iteration_contiguous_; - - if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) - return *this; - - // Enter here only if (iteration_contiguous_ == - // ThreadMap::Iteration::kContiguous) - iteration_contiguous_ = 0; - ++iteration_strided_; - - if (iteration_strided_ < ThreadMap::Iterations::kStrided) { - return *this; - } - - // Enter here only if (iteration_strided_ == ThreadMap::Iteration::kStrided) - // which means we enter the next tile. - iteration_strided_ = 0; - - return *this; - } - - /// Advances to the next tile in memory. 
- CUTLASS_HOST_DEVICE - RegularTileAccessIterator operator++(int) { - RegularTileAccessIterator prev(*this); - this->operator++(); - - return prev; - } - - /// Adds a tile offset - CUTLASS_DEVICE - void add_tile_offset(TensorCoord const &coord) { - add_pointer_offset(coord.contiguous() * Shape::kCount); - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -/// Tile iterator specialized for k interleaved arrangements for TensorOps -/// -/// -/// Satisfies: ForwardTileIteratorConcept | -/// ReadableContiguousTileIteratorConcept | -/// WriteableContiguousTileIteratorConcept -/// - -template -class RegularTileAccessIterator< - Shape_, Element_, - layout::TensorOpMultiplicandColumnMajorInterleaved::value, - InterleavedK>, - AdvanceRank, ThreadMap_, Alignment> { - - public: - static_assert( - AdvanceRank == 0 || AdvanceRank == 1, - "Specialization for pitch-linear iterator may along advance along the " - "contiguous(rank=0) or strided(rank=1) dimension."); - - using Shape = Shape_; - using Element = Element_; - using Layout = - layout::TensorOpMultiplicandColumnMajorInterleaved::value, - InterleavedK>; - static int const kAdvanceRank = AdvanceRank; - static int const kAlignment = Alignment; - - using Index = typename Layout::Index; - using LongIndex = typename Layout::LongIndex; - - using TensorRef = TensorRef; - using TensorCoord = typename Layout::TensorCoord; - - using ThreadMap = ThreadMap_; - - /// Underlying iterator type - using UnderlyingIterator = RegularTileAccessIterator< - cutlass::MatrixShape, - Element, - layout::TensorOpMultiplicandRowMajorInterleaved::value, InterleavedK>, - (kAdvanceRank == 1 ? 0 : 1), - ThreadMap - >; - - private: - - /// Element type per access - using AccessType = Array; - - private: - - /// Underlying iterator - UnderlyingIterator iterator_; - - public: - /// Construct a TileIterator with zero threadblock offset - CUTLASS_HOST_DEVICE - RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor - int thread_id ///< ID of each participating thread - ) - : iterator_({ref.data(), ref.stride()}, thread_id) {} - - /// Overrides the internal iteration index - CUTLASS_HOST_DEVICE - void set_iteration_index(int index) { - iterator_.set_iteration_index(index); - } - - /// Adds a pointer offset in units of Element - CUTLASS_HOST_DEVICE - void add_pointer_offset(LongIndex pointer_offset) { - iterator_.add_pointer_offset(pointer_offset); - } - - /// Returns a pointer - CUTLASS_HOST_DEVICE - AccessType *get() const { - return iterator_.get(); - } - - /// Advances to the next tile in memory. - CUTLASS_HOST_DEVICE - RegularTileAccessIterator &operator++() { - ++iterator_; - return *this; - } - - /// Advances to the next tile in memory. 
- CUTLASS_HOST_DEVICE - RegularTileAccessIterator operator++(int) { - RegularTileAccessIterator prev(*this); - ++iterator_; - - return prev; - } - - /// Adds a tile offset - CUTLASS_DEVICE - void add_tile_offset(TensorCoord const &coord) { - iterator_.add_tile_offset({coord.strided(), coord.contiguous()}); - } -}; - -//////////////////////////////////////////////////////////////////////////////// - } // namespace threadblock } // namespace transform } // namespace cutlass diff --git a/media/docs/functionality.md b/media/docs/functionality.md index 3c416b3e9d..77f1ba142c 100644 --- a/media/docs/functionality.md +++ b/media/docs/functionality.md @@ -44,6 +44,28 @@ Hyperlinks to relevant unit tests demonstrate how specific template instances ma | **SpTensorOp** | 80 | 11.1+ | `s4 * s4 + s32 => {s4, s32}` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu) | +## Device-level Implicit GEMM convolution + +The following table summarizes device-level implicit GEMM convolution kernels in CUTLASS, organized by opcode class, data type, and layout. +Hyperlinks to relevant conv2d fprop unit tests demonstrate how specific template instances may be defined. +One can find and/or create equivalent dgrad and wgrad convolutional operators. + +|**Opcode Class** | **Compute Capability** | **CUDA Toolkit** | **Data Type** | **Layouts** | **Unit Test** | +|-----------------|------------------------|------------------|--------------------------------|------------------|------------------| +| **Simt** | 50,60,61,70,75 | 9.2+ | `f32 * f32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu) | +| **Simt** | 50,60,61,70,75 | 9.2+ | `cf32 * cf32 + cf32 => cf32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu) | +| **TensorOp** | 70 | 10.1+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu) | +| **TensorOp** | 75 | 10.2+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu) | +| **TensorOp** | 75 | 10.2+ | `s8 * s8 + s32 => {s32, s8}` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu) | +| **Simt** | 80 | 11.0+ | `f32 * f32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu) | +| **Simt** | 80 | 11.0+ | `cf32 * cf32 + cf32 => cf32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f32 => {f16, f32}`| NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f16 => f16` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `tf32 * tf32 + f32 => f32` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `s8 * s8 + s32 => {s32, s8}` | NHWC | [example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `s4 * s4 + s32 => {s32, s4}` | NHWC | 
[example](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu) |
+
+
 ## Warp-level Matrix Multiply with Tensor Cores
diff --git a/media/docs/implicit_gemm_convolution.md b/media/docs/implicit_gemm_convolution.md
new file mode 100644
index 0000000000..34102918d3
--- /dev/null
+++ b/media/docs/implicit_gemm_convolution.md
@@ -0,0 +1,779 @@
+![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS Implicit GEMM API")
+
+[README](/README.md#documentation) > **Implicit GEMM Convolution**
+
+# CUTLASS Convolution
+
+Implicit GEMM is the formulation of a convolution operation as a GEMM (generalized matrix-matrix
+product). Convolution takes an activation tensor and applies a sliding filter on it to produce an
+output tensor.
+
+## Introduction
+
+This release of CUTLASS contains several artifacts related to convolution.
+
+- [**Implicit GEMM Algorithm**](implicit_gemm_convolution.md#implicit-gemm-algorithm)
+- [**CUTLASS Convolution Implementation**](implicit_gemm_convolution.md#cutlass-convolution-implementation)
+- [**Convolution Examples**](implicit_gemm_convolution.md#convolution-example)
+
+
+# Implicit GEMM Algorithm
+
+2-D convolution may be mapped to matrix multiply by forming a _convolution matrix_ containing
+elements of the activations tensor, then multiplying this by a matrix formed from the filters tensor.
+The earliest form of this algorithm constructs the convolution matrix explicitly via an operation
+conventionally referred to as `im2col`. The resulting matrix replicates each activation element by a factor
+equal to the filter size, consuming additional storage capacity and memory bandwidth.
+
+The _implicit GEMM_ algorithm is a variation on the blocked, hierarchical GEMM computation in CUDA
+that instead forms tiles of the convolution matrix on the fly as data is loaded from global memory
+into Shared Memory by carefully updating pointers and predicates. Once the convolution matrix is
+formed in Shared Memory, the existing components computing warp-level GEMM accumulate the result of
+convolution and update the output tensor.
+
+This section describes the structure of an efficient Implicit GEMM Convolution CUDA kernel
+for Turing Tensor Cores.
+
+## Mapping Convolution to GEMM
+
+The forward convolutional layer computes an output tensor _y = conv(x, w)_ where x(NHWC), w(KRSC), and y(NPQK)
+are 4-D tensors.
+
+This computation may be described by the following analytic function.
+
+```
+y[n, p, q, k] = sum_c(sum_r(sum_s( x[n, f(p, r), g(q, s), c] * w[k, r, s, c] )))
+```
+where functions _f_ and _g_ are defined as follows.
+
+```
+f(p, r) = p * stride_h + R - r - 1 + pad_h
+g(q, s) = q * stride_w + S - s - 1 + pad_w
+```
+
+[Host](/tools/util/include/reference/host/convolution.h) and [device](/tools/util/include/reference/device/convolution.h)
+reference implementations are provided in the CUTLASS Utilities.
+
+This computation may be mapped to the elements of a matrix product as follows.
+
+```
+C = gemm(A, B)
+```
+where
+- A is a row-major matrix of extent _NPQ_-by-_RSC_ containing activations
+- B is a column-major matrix of extent _RSC_-by-_K_ containing filters
+- C is a row-major matrix of extent _NPQ_-by-_K_ containing the output
+
+Each element of the output matrix _Cij_ corresponds to an element in the output tensor y[n, p, q, k] according to
+the following relation.
+```
+y[n, p, q, k] = Cij
+```
+where
+```
+i = q + Q * (p + P * n)
+j = k
+```
+
+These relations may be inverted as follows.
+``` +k = j + +n = i / (PQ) +residual = i % (PQ) + +p = residual / Q +q = residual % Q +``` + +The triple loop nest iterating over CRS to accumulate the result may also be linearized and mapped to the inner +GEMM _K_ dimension (not to be confused with the filter tensor dimension _K_) by the following relations. + +``` +gemm_k = s + S * (r + R * c) +``` +and inverse +``` +c = gemm_k / (RS) +residual = gemm_k % (RS) + +r = residual / S +s = residual % S +``` + +Given these equations, a GEMM triple loop nest could be augmented with tensor indexing as follows. +```c++ +int GEMM_M = N * P * Q; +int GEMM_N = K; +int GEMM_K = C * R * S; + +for (int gemm_i = 0; gemm_i < GEMM_M; ++gemm_i) { + for (int gemm_j = 0; gemm_j < GEMM_N; ++gemm_j) { + + int n = gemm_i / (PQ); + int npq_residual = gemm_i % (PQ); + + int p = npq_residual / Q; + int q = npq_residual % Q; + + Accumulator accum = 0; + + for (int gemm_k = 0; gemm_k < GEMM_K; ++gemm_k) { + + int k = gemm_j; + + int c = gemm_k / (RS); + int crs_residual = gemm_k % (RS); + + int r = crs_residual / S; + int s = crs_residual % S; + + int h = f(p, r); + int w = g(q, s); + + ElementA a = tensor_A.at({n, h, w, c}); + ElementB b = tensor_B.at({k, r, s, c}); + + accum += a * b; + } + + C[gemm_i * K + gemm_j] = accum; + } +} +``` +The [CUTLASS GEMM implementation](/media/docs/efficient_gemm.md) explicitly iterates over tiles. Consequently, +a tile iterator could be implemented to compute these functions analytically and load the appropriate +elements. However, the resulting modulo arithmetic would be computationally intensive, and overhead would +limit performance of a GEMM kernel targeting Turing Tensor Cores. + +The following section describes how an efficient implementation may be implemented within the structure of +a hierarchical GEMM kernel targeting Tensor Cores. + + +# CUTLASS Convolution Implementation + +The CUTLASS Implicit GEMM implementation makes several assumptions. + +- All tensors are 128-bit aligned NHWC tensors +- Channel count (C) is a multiple of 32 elements +- Filter count (K) is a multiple of 32 elements + +This enables 128-bit vector memory acceses which lead to efficient CUDA kernels. + +# CUTLASS Device-level Convolution Operator + +CUTLASS defines CUDA C++ templates accepting numerous template arguments to specialize the resulting +kernel by operation, data type, tile configuration, math instruction, and fused output operation. + +In [09_turing_tensorop_conv2dfprop.cu](/examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop.cu), a convolution +operation is defined as follows. 
+
+```c++
+/// Define an Implicit GEMM convolution forward propagation (fprop) kernel
+using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+  ElementInputA,                               // data type of element a (mapped to activation for fprop)
+  LayoutInputA,                                // layout of element a (mapped to activation for fprop)
+  ElementInputB,                               // data type of element b (mapped to filters for fprop)
+  LayoutInputB,                                // layout of element b (mapped to filters for fprop)
+  ElementC,                                    // data type of element c (mapped to output for fprop)
+  LayoutC,                                     // layout of element c (mapped to output for fprop)
+  ElementAccumulator,                          // data type of internal accumulation
+  MMAOp,                                       // opcode class tag
+  SmArch,                                      // target SM architecture
+  ThreadblockShape,                            // shape of threadblock tile
+  WarpShape,                                   // shape of warp-level GEMM tile
+  InstructionShape,                            // shape of target math instruction
+  EpilogueOp,                                  // epilogue operator
+  SwizzleThreadBlock,                          // optional function to reorder threadblocks for locality
+  NumStages,                                   // number of pipeline stages in threadblock-scoped GEMM
+  cutlass::arch::OpMultiplyAddSaturate,        // math operation on data of element a and b
+  cutlass::conv::IteratorAlgorithm::kAnalytic  // global memory iterator algorithm
+>::Kernel
+```
+
+This template is intended to be generic and cover all feasible configurations. The example specifies
+the following concrete data types, layouts, and tile sizes.
+
+```c++
+/// Define an Implicit GEMM convolution forward propagation (fprop) kernel
+using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+  cutlass::int4b_t,                            // data type of element a (mapped to activation for fprop)
+  cutlass::layout::TensorNHWC,                 // layout of element a (mapped to activation for fprop)
+  cutlass::int4b_t,                            // data type of element b (mapped to filters for fprop)
+  cutlass::layout::TensorNHWC,                 // layout of element b (mapped to filters for fprop)
+  int32_t,                                     // data type of element c (mapped to output for fprop)
+  cutlass::layout::TensorNHWC,                 // layout of element c (mapped to output for fprop)
+  int32_t,                                     // data type of internal accumulation
+  cutlass::arch::OpClassTensorOp,              // opcode class tag
+  cutlass::arch::Sm75,                         // target SM architecture
+  cutlass::gemm::GemmShape<128, 128, 128>,     // shape of threadblock tile
+  cutlass::gemm::GemmShape<64, 64, 128>,       // shape of warp-level GEMM tile
+  cutlass::gemm::GemmShape<8, 8, 32>,          // shape of target math instruction
+  cutlass::epilogue::thread::LinearCombinationClamp<
+    int32_t,                                   // data type of output matrix
+    8,                                         // the number of elements per vectorized
+                                               // memory access. This becomes the vector width of
+                                               // math instructions in the epilogue too.
+    int32_t,                                   // data type of accumulator
+    float>,                                    // epilogue operator
+  SwizzleThreadBlock,                          // optional function to reorder threadblocks for locality
+  2,                                           // number of pipeline stages in threadblock-scoped GEMM
+  cutlass::arch::OpMultiplyAddSaturate,        // math operation on data of element a and b
+  cutlass::conv::IteratorAlgorithm::kAnalytic  // global memory iterator algorithm
+>::Kernel
+```
+
+That is, this computes 2D convolutional forward propagation with 4-bit integer inputs and outputs (`cutlass::int4b_t`).
+Internal accumulation is performed using 32-bit integers (`int32_t`), and an elementwise linear combination operation
+is performed on the output in single-precision floating point (`float`).
+
+The threadblock and warp-level tile sizes refer to the hierarchically blocked GEMM computation
+[described here](/media/docs/gemm_api.md).
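+
+As a quick illustrative check (not part of the example source; the constant names below are local
+to this sketch), the warp count per threadblock follows from the ratio of the threadblock tile to
+the warp tile in each dimension:
+
+```c++
+// Warps per threadblock implied by the tile shapes chosen above.
+constexpr int kWarpsM = 128 / 64;    // ThreadblockShape::kM / WarpShape::kM
+constexpr int kWarpsN = 128 / 64;    // ThreadblockShape::kN / WarpShape::kN
+constexpr int kWarpsK = 128 / 128;   // ThreadblockShape::kK / WarpShape::kK
+
+constexpr int kWarpCount = kWarpsM * kWarpsN * kWarpsK;   // 2 * 2 * 1 = 4 warps
+constexpr int kThreadsPerBlock = kWarpCount * 32;         // 128 threads per threadblock
+```
+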
Larger tiles achieve greater reuse of data loaded through shared memory +but launch fewer CTAs and may not fully occupy the GPU for small problem sizes. Smaller tile configurations achieve +lower peak utilizations but may better match the number of SMs within the GPU for real-world workloads. + + +## Launching the convolution + +The following code collects the arguments for an implicit GEMM operation into a structure. + +```c++ +// +// Define arguments for CUTLASS Convolution +// + +// mode (kCrossCorrelation or kConvolution) +cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; + +// Split K dimension into 1 partitions +int split_k_slices = 1; + +cutlass::conv::Conv2dProblemSize problem_size( + options.input_size, + options.filter_size, + options.padding, + options.conv_stride, + options.dilation, + options.output_size(), + mode, + split_k_slices); + +typename ImplicitGemm::Arguments arguments{ + problem_size, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c.device_ref(), + tensor_c.device_ref(), + {options.alpha, options.beta}, +}; +``` + +The `mode` flag indicates whether to compute cross correlation or convolution. The arguments +`input_size`, `filter_size`, `padding`, `conv_stride`, and `dilation` specify the dimensions of the +input and output tensors and characterize the problem size. + +The arguments `tensor_a.device_ref()`, `tensor_b.device_ref()`, and `tensor_c.device_ref()` are +CUTLASS `TensorRef<>` objects containing a pointer to the tensor data in GPU device memory and stride values. + +The following code initializes and launches the Implicit GEMM operation on the device. After initializing +the arguments structure, it is used to query device-side workspace requirements and allocate them +in device memory if needed. + +Then, the Implicit GEMM object is initialized with the `arguments` structure and the workspace in +device memory. This initialization step precomputes internal lookup tables used by the convolution kernel +and may also clear the device-side workspace if needed. + +Finally, the initialized Implicit GEMM object is called, launching a kernel on the device. `tensor_c` now +contains the result of the implicit GEMM. + +```c++ +ImplicitGemm implicit_gemm_op; + +// Query workspace size +size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + +// Allocate workspace memory +cutlass::device_memory::allocation workspace(workspace_size); + +// Initialize the Implicit GEMM object +cutlass::Status status = implicit_gemm_op.initialize(arguments, workspace.get()); + +if (status != cutlass::Status::kSuccess) { + /* error */ +} + +// +// Launch initialized CUTLASS kernel +// + +status = implicit_gemm_op(); + +if (status != cutlass::Status::kSuccess) { + /* error */ +} +``` + +The example demonstrates how the input and output tensors may be written to a file as CSV using +`cutlass::HostTensor<>` defined in the [CUTLASS Utilities](/media/docs/utilities.md). + +```c++ + std::ofstream output_workspace(ss.str()); + + output_workspace + << "Input = \n" << tensor_a.host_view() << "\n\n" + << "Filters = \n" << tensor_b.host_view() << "\n\n"; + + // Copy device memory to host backing store + tensor_c.sync_host(); + + output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl; +``` + + +## CUTLASS Components + +CUTLASS defines the following CUDA C++ templates to implement Implicit GEMM Convolution which are described in greater detail in subsequent sections. + +**Activations tile iterators** load the activations tile into registers. 
Two implementations are provided:
+- [conv2d_fprop_activation_tile_access_iterator_analytic.h](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h) computes pointer deltas and masks analytically
+- [conv2d_fprop_activation_tile_access_iterator_optimized.h](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h) optimizes iterating over global memory and
+creating the GEMM-A tile in shared memory.
+
+**Filter tile iterators** load filters into registers. Similarly, two implementations are provided:
+- [conv2d_fprop_filter_tile_access_iterator_analytic.h](/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h) computes pointer deltas and masks analytically
+- [conv2d_fprop_filter_tile_access_iterator_optimized.h](/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h) optimizes iterating over global memory and
+creating the GEMM-B tile in shared memory.
+
+The improvements covered by the optimized iterators are:
+- (a) Precomputing kernel-invariant pointer deltas on the host
+- (b) Computing CTA-invariant mask predicates in device-side iterator constructors
+- (c) Use of [fast divmod](/include/cutlass/fast_math.h) to map GEMM dimensions to convolution tensors.
+For example, the _optimized_ activation iterator uses fast divmod to map the GEMM _M_ dimension to _NPQ_.
+
+
+**Pipelined mainloop** loads threadblock-scoped tiles from global memory into shared memory and then applies
+CUTLASS warp-level GEMM operations to load from Shared Memory and issue instructions to Turing Tensor Cores.
+- [implicit_gemm_pipelined.h](/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h)
+
+Operations for storing to shared memory and performing warp-wide matrix multiply operations using
+Turing Tensor Cores are applied directly from the CUTLASS GEMM components. These include the
+following components.
+
+**Regular Tile Iterator** implemented in
+[transform::threadblock::RegularTileIterator](/include/cutlass/transform/threadblock/regular_tile_iterator.h)
+stores register-backed fragments to Shared Memory in permuted layouts.
+
+**Warp-level GEMM** defined in [cutlass::gemm::warp::MmaTensorOp](/include/cutlass/gemm/warp/mma_tensor_op.h)
+defines tile iterators to load from Shared Memory and issue math instructions to Turing Tensor Cores.
+Further details are [described here](/media/docs/gemm_api.md#warp-level-matrix-multiply-api).
+
+**Epilogue** reorders accumulator elements among threads within a threadblock to efficiently update
+the output tensor. It is implemented in [epilogue::threadblock::Epilogue](/include/cutlass/epilogue/threadblock/epilogue.h).
+
+### Loading Activations and Filters
+
+The Implicit GEMM Convolution algorithm partitions the GEMM _K_ dimension (of extent _CRS_) into
+threadblock tiles, assigning each threadblock tile to one filter position and an interval
+of channels. After iterating over all filter positions, the convolution algorithm advances to the
+next interval of channels and proceeds from filter `r=0, s=0`.
+
+The matrix product of one threadblock tile is computed per iteration of
+the mainloop as described in the [CUTLASS GEMM implementation](/media/docs/efficient_gemm.md). To
+summarize, the threadblock tiles of activations and filters are loaded from tensors in global memory
+and stored to shared memory. Each thread within the threadblock loads one or more vectors, and
+together the threads span the entire tile.
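+
+The traversal order described above can be written as a small host-side sketch (illustrative only;
+the filter extents and the 32-channel interval below are example values consistent with this
+discussion, not CUTLASS code):
+
+```c++
+#include <cstdio>
+
+int main() {
+  int R = 3, S = 3, C = 64;   // filter extents and channel count (illustrative)
+  int c_interval = 32;        // channels covered by one threadblock tile (illustrative)
+
+  // Outer loop: advance to the next interval of channels only after all filter
+  // positions (r, s) have been visited for the current interval.
+  for (int c = 0; c < C; c += c_interval) {
+    for (int r = 0; r < R; ++r) {
+      for (int s = 0; s < S; ++s) {
+        std::printf("threadblock tile covers r=%d, s=%d, c=[%d, %d)\n", r, s, c, c + c_interval);
+      }
+    }
+  }
+  return 0;
+}
+```
+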
+
+The following figure illustrates one particular iteration of the Implicit GEMM mainloop. Each
+thread within the threadblock is mapped to several vectors of elements in the Activations and
+Filters tensors. Each index in the GEMM _M_ dimension corresponds to a unique _(N,P,Q)_
+index of the output tensor, and pointers may be computed based on this as well as
+filter position _(r,s)_.
+
+![ALT](/media/images/conv2d-fprop-int4.png "Convolution Forward Propagation on INT4 data.")
+
+The CUTLASS component that embodies this functionality is [Conv2dFpropActivationTileAccessIteratorAnalytic](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h).
+Its constructor computes the mapping of GEMM _M_ to _(N, P, Q)_, and the `at()` method computes the linear offset into the Activations
+tensor for each memory access the thread is to perform. Additionally, the method `valid()` computes the validity of each access
+for each filter position, indicating whether the memory access falls within the bounds of the
+tensor or out of bounds.
+
+`operator++()` iterates over the memory accesses performed by a thread in both the contiguous and strided dimensions.
+
+```c++
+// cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
+
+// Update iterator to thread's next contiguous, strided memory access
+Conv2dFpropActivationTileAccessIteratorAnalytic &operator++() {
+  ++iteration_contiguous_;
+  if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+    return *this;
+  }
+  iteration_contiguous_ = 0;
+
+  ++iteration_strided_;
+  if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+    return *this;
+  }
+  iteration_strided_ = 0;
+
+  return *this;
+}
+```
+
+After all accesses have been visited for the current threadblock tile, `advance()` updates the pointers to the next tile.
+The offsets added to each pointer follow the traversal of filter positions, performing one of the
+following:
+- advance from filter position _(r, s, c)_ to filter position _(r, s+1, c)_
+- advance from filter position _(r, S-1, c)_ to filter position _(r+1, 0, c)_
+- advance from filter position _(R-1, S-1, c)_ to filter position _(0, 0, c+32)_
+
+The following logic within the body of `advance()` computes the above three updates for the activation GEMM-A tile.
+
+```c++
+// cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
+
+// Advance to the next access
+void advance() {
+  // moves to the next tile
+  ++filter_s_;
+  if (filter_s_ < problem_size_.S) {
+    return;
+  }
+  filter_s_ = 0;
+
+  ++filter_r_;
+  if (filter_r_ < problem_size_.R) {
+    return;
+  }
+  filter_r_ = 0;
+
+  filter_c_ += Shape::kRow * problem_size_.split_k_slices;
+}
+```
+
+Similar logic holds for [Conv2dFpropFilterTileAccessIteratorAnalytic](/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h).
+
+To reduce computational overhead in the mainloop body, the pointer offsets may be precomputed
+in host code and provided to the CUDA kernel as a lookup table in its `Params` structure.
+As shown in [Conv2dFpropActivationTileAccessIteratorOptimized](/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h),
+the logic to compute offsets from the filter position has been extracted to the `Params` constructor.
+
+```c++
+// cutlass/conv/threadblock/conv2d_params.h
+struct Conv2dFpropActivationIteratorOptimizedParams {
+  ...
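+// inc_next[] is the delta table referenced in the surrounding text: one precomputed byte
+// offset for each of the three traversal cases (next S, next R, next C), so that the
+// device-side advance() only performs a table lookup instead of recomputing pointer arithmetic.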
+// next S +inc_next[0] = conv_sign * (int64_t(layout.stride()[0]) * problem_size.dilation_w) * element_size_bits / 8; + +// next R +inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + +// next C +inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + ... +} +``` + +This allows only a simple lookup from the _delta table_ performed in device code in `Conv2dFpropActivationTileAccessIteratorOptimized::advance()` + +```c++ +// cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h +CUTLASS_HOST_DEVICE +void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + filter_s_ = 0; + ++filter_r_; + + if (filter_r_ < problem_size_.R) { + next_idx = 1; + } + else { + filter_r_ = 0; + next_idx = 2; + } + } + + add_byte_offset_(params_.inc_next[next_idx]); // in addition to Conv2dFpropActivationTileAccessIteratorAnalytic::advance() + + if (next_idx == 2) { + filter_c_ += params_.filter_c_delta; + } +} + +``` + +### Utilizing Tensor Cores + +Turing Tensor Cores compute matrix multiply-accumulate operations efficiently by sharing data among all +threads within a warp. The following operations are supported. + +|**Shape**|**A**|**B**|**C**| +|---------|-----|-----|-----| +| 8x8x32 | int4b_t | int4b_t | int32_t | +| 8x8x16 | int8b_t | int8b_t | int32_t | +| 16x8x8 | half | half | half | +| 16x8x8 | half | half | float | + +Functionally, the Turing 8x8x32 matrix multiply operation distributes the _A_, _B_, and _C_ matrix across 32 +threads within a warp according to the following illustration. + +![ALT](/media/images/mma-8x8x32.png "Turing Tensor Op") + +This Tensor Core operation is accessible to the CUDA programmer via the PTX instruction +[`mma.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-8832). +CUTLASS wraps inline PTX with device-side intrinsics defined in [`cutlass/arch/mma_sm75.h`](/include/cutlass/arch/mma_sm75.h) +as in the following example. + +```c++ +unsigned A; // eight packed 4-bit integer elements +unsigned B; // eight packed 4-bit integer elements + +int C[2]; // two 32-bit integer elements +int D[2]; // two 32-bit integer elements + +asm volatile( + "mma.sync.aligned.m8n8k32.row.col.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +``` + +To efficiently load data from Shared Memory into registers with the distribution among +warps matching the above, the Turing GPU architecture introduces +[`ldmatrix`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix). +`ldmatrix` is the ultimate warp-cooperative instruction, as all threads contribute addresses to up to 32 row vectors of +size 128-bits in length. These rows are fetched from Shared Memory and then distributed among groups of four threads +per row. + +The arrangement of SMEM pointers and destination registers within threads is illustrated as follows. Thread 0 is highlighted +in the illustration to emphasize the mapping. 
+
+![ALT](/media/images/ldmatrix-8x128bx4.png "Turing ldmatrix PTX instruction")
+
+The size of the Turing Tensor Core operation computing matrix multiply-accumulate on INT4 data is 8-by-8-by-32
+elements. `ldmatrix` fetches up to 32 rows (or columns) per operation. Sixteen Tensor Core operations may be issued
+to implement a 32-by-32-by-32 matrix product and perfectly consume all data loaded by two `ldmatrix` instructions
+as shown in the following figure. Larger tiles are possible by increasing the number of memory instructions
+and issuing more Tensor Core operations, up to warp-level matrix operations of size 64-by-64-by-32. The limit is
+the number of registers to hold the accumulator elements.
+
+![ALT](/media/images/ldmatrix-tensorop-32x32x32.png "Turing ldmatrix PTX instruction feeding Tensor Core operations")
+
+### Shared Memory Layouts
+
+In the previous two sections, we have described how data may be loaded from activations and filters tensors
+in global memory to compute convolution, and we have described a composition of `ldmatrix` and `mma.sync`
+to fetch data from Shared Memory and issue Tensor Core operations.
+
+To ensure this data movement is efficient, care must be taken to ensure bank conflicts are avoided. CUTLASS
+uses a permuted Shared Memory layout to avoid bank conflicts when storing to Shared Memory and to efficiently
+load from Shared Memory using `ldmatrix`. The following figure illustrates the thread mapping used for
+loading the activations and filters threadblock tiles from global memory and the permuted layout in
+Shared Memory.
+
+![ALT](/media/images/tensor-op-permuted-smem-layout-TN.png "Shared Memory layout used for Turing Tensor Cores")
+
+In the illustration, one warp-wide memory access is highlighted in blue, with individual threads
+loading one 128-bit vector. The tile in global memory could correspond either to the activations
+or filters and is assumed to be 'strip-mined' with four threads loading consecutive channels.
+
+Shared Memory is visualized as a 'row-major' matrix with eight columns representing
+the eight 128-bit banks.
+As described in the CUTLASS GTC 2019 presentation [slides](https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9593-cutensor-high-performance-tensor-operations-in-cuda-v2.pdf),
+[recording](https://developer.nvidia.com/gtc/2019/video/S9593), an access to Shared Memory will be conflict-free if
+the following conditions are satisfied across each warp:
+- {T0, T1, .., T7} do not access the same 128-bit bank
+- {T8, T9, .., T15} do not access the same 128-bit bank
+- {T16, T17, .., T23} do not access the same 128-bit bank
+- {T24, T25, .., T31} do not access the same 128-bit bank
+
+To achieve conflict-free stores, the Shared Memory layout remaps the strip-mined arrangement to transpose
+the vectors and applies an XOR operation on the column index of each thread's pointer. Specifically,
+
+```c++
+  int store_column = (lane_id % 8) ^ (lane_id / 8);
+```
+
+This transformation on the layout will be instrumental in reading slices of data from Shared Memory
+to compute the warp-level matrix multiply using Tensor Cores.
+
+The following figure shows how the first sixteen threads participating in an `ldmatrix` instruction
+logically map to the c=0..31 slice of a matrix in Shared Memory. This slice is known as a "k-group"
+within the code because it corresponds to the same K-index of a warp-level matrix multiply.
+ +![ALT](/media/images/tensor-op-permuted-smem-layout-TN-k0.png "Load kgroup=0 from Shared Memory using ldmatrix") + +The lower half of the figure shows the physical arrangement in Shared Memory, with threads offset by row and column +according to the XOR function. By inspection, we can observe there are no bank conflicts, as _T0 ... T7_ each access unique +banks, as do _T8 ... T15_. and beyond. + +To advance to the next "k-group" within Shared Memory, pointers are updated using an XOR operation according to +the following sequence: +- **^1** advances from _k=0_ to _k=1_ +- **^3** advances from _k=1_ to _k=2_ +- **^1** advances from _k=2_ to _k=3_ +- **^3** advances from _k=3_ to _k=0_ + +The first of these transitions is shown below. +![ALT](/media/images/tensor-op-permuted-smem-layout-TN-k1.png "Advance to kgroup=1 from Shared Memory using ldmatrix") + +The [CUTLASS warp-level GEMM API](/media/docs/gemm_api.md#warp-level-matrix-multiply-api) defines templates for +loading slices of data from permuted Shared Memory and issuing operations to Tensor Cores. + +### Updating the Output Tensor + +After the mainloop terminates, the accumulator tile of the warp-level GEMM stores a warp's contribution to the output +tensor. However, the distribution of data among threads within the threadblock is specialized for efficient matrix multiply-accumulate +operations using Tensor Cores and is not conducive to efficient, coalesced operations to Global Memory. A data rearrangement is +needed. + +The **Epilogue** is the component for exchanging accumulator elements through Shared Memory, loading slices of the output +matrix or tensor, applying an elementwise operation such as linear scaling or bias, and storing the result to the output tensor. +CUTLASS structures this as several components: +- [cutlass::epilogue::threadblock::Epilogue](/include/cutlass/epilogue/threadblock/epilogue.h) - the top-level component for looping over the entire threadblock tile +- [cutlass::epilogue::warp::TileIteratorTensorOp](/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h) - a specialized component for storing accumulators for Tensor Core to Shared Memory +- [cutlass::epilogue::threadblock::SharedLoadIterator](/include/cutlass/epilogue/threadblock/shared_load_iterator.h) - a component for loading elements from a row-major arrangement in Shared Memory +- [cutlass::epilogue::threadblock::PredicatedTileIterator](/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h) - a component for loading or storing matrix fragments to Global Memory (with bounds checks) +- [cutlass::epilogue::thread::LinearCombination](/include/cutlass/epilogue/thread/linear_combination.h) - an element-wise function computing `alpha * AB + beta * C` to compute the final output + +## Unit Tests + +Unit tests verify the functional behavior of each of the above components in a standalone CUDA kernel. This provides a +convenient environment to (a.) inspect the template definition, (b.) showcase instantiation of use of these templates +in device code, and (c.) assert functional correctness. 
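+
+For example, a standalone device-level test follows the pattern below. This is a schematic sketch
+only: the test name is illustrative, and the header `cutlass/conv/kernel/default_conv2d_fprop.h` is
+assumed by analogy with the dgrad tests; the authoritative versions are the unit tests linked below.
+
+```c++
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"        // assumed fprop counterpart of default_conv2d_dgrad.h
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32,
+  128x128_128x2_64x64x128) {
+
+  // Device-level Conv2d kernel mirroring the int4 fprop example earlier in this document
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    cutlass::int4b_t,
+    cutlass::layout::TensorNHWC,
+    cutlass::int4b_t,
+    cutlass::layout::TensorNHWC,
+    int32_t,
+    cutlass::layout::TensorNHWC,
+    int32_t,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm75,
+    cutlass::gemm::GemmShape<128, 128, 128>,
+    cutlass::gemm::GemmShape<64, 64, 128>,
+    cutlass::gemm::GemmShape<8, 8, 32>,
+    cutlass::epilogue::thread::LinearCombinationClamp<
+      int32_t,
+      8,
+      int32_t,
+      float
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAddSaturate,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  // Wrap the kernel in the device-level operator
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  // Run the testbed over a set of problem sizes with a reference check
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
+}
+```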
+ +**Convolution unit tests** +- Device-wide convolution operator: [conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu](/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu) + +**GEMM unit tests** +- Warp-scoped matrix multiply for Turing Tensor Cores: [gemm_sm75.cu](/test/unit/gemm/warp/gemm_sm75.cu) + +**Epilogue unit tests** +- Epilogue for Turing Tensor Cores: [epilogue_tensor_op.cu](/test/unit/epilogue/threadblock/epilogue_tensor_op.cu) + + +# Convolution Example + +This section describes the provided convolution example and is intended to orient the reader to the CUTLASS implementation +of Implicit GEMM Convolution. + +## Building and Running the Example + +Example `09_turing_tensorop_conv2dfprop` computes a forward convolutional layer in which inputs and +outputs are 4-b integers. The example source is visible in +[examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu](/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu). + + +Before building the example, first perform the prerequisite steps for building any CUTLASS component [described here](/media/docs/quickstart.md). +Compute capability 7.5 refers to the Turing architecture, and this work requires CUDA 10.2 Toolkit or later to target +Turing Tensor Cores using the native `mma` [PTX instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-8832). + +```bash +$ mkdir build && cd build + +$ cmake .. -DCUTLASS_NVCC_ARCHS=75 +``` + +To build the example, execute `make 09_turing_tensorop_conv2dfprop` from the build directory. +```bash +$ make 09_turing_tensorop_conv2dfprop + +$ ls examples/09_turing_tensorop_conv2dfprop +examples/09_turing_tensorop_conv2dfprop + +``` + +This example provides a simple command line interface to specify the extents of 4D tensors of 4-bit integer elements (`cutlass::int4b_t`), +initialize them to random values, and compute the result of a convolutional layer. Optionally, the input and output +tensors may be saved to .csv files, and the CUTLASS host-side reference check may be executed to verify correctness. + +The complete usage statement is visible by running with `--help`: +```bash +$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --help +09_turing_tensorop_conv2dfprop example + + This example uses Turing's Tensor Core operators on int4 data types to compute + forward convolution on tensors of layout NHWC. + +Options: + + --help If specified, displays this usage statement. + + --n Input tensor extent N + --h Input tensor extent H + --w Input tensor extent W + --c Input tensor extent C + --k Filter extent K + --r Filter extent R + --s Filter extent S + + --alpha Epilogue scalar alpha + --beta Epilogue scalar beta + + --ref-check If set (true), reference check on the host is computed + --perf-check If set (true), performance is measured. + --benchmark If set (true), performance benchmarking on several layers and batch-size. + --iterations Number of profiling iterations to perform. + --save-workspace If set, workspace is written to a text file. 
+ --tag String to replicate across the first column in the results table + + + +Examples: + +$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1 + +$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check +``` + +*Note*, this example assumes all tensors are 128b aligned and in format _NHWC_. Consequently, dimension +_C_ must be divisible by 32 for activations, filters, and output. + +If the option `--benchmark` is passed, several layers from ResNet50 are profiled for various batch sizes. +This sample output was computed on an NVIDIA RTX 2080 compiled with CUDA 10.2. + +```bash +build$ ./examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop --benchmark +``` + +Convolution can also be run by the CUTLASS Profiler. + + +# Copyright + +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + +``` + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` diff --git a/media/docs/profiler.md b/media/docs/profiler.md index dd1f62a7c9..032848c6fa 100644 --- a/media/docs/profiler.md +++ b/media/docs/profiler.md @@ -109,16 +109,29 @@ About: Operations: --operation= Specifies a particular operation to run or print the usage statement. - gemm General matrix-matrix product. D = alpha * A*B + beta * C + gemm General matrix-matrix product. D = alpha * A*B + beta * C + spgemm Structured sparse GEMM. D = alpha * A*B + beta * C + conv2d Conv2d operation. Output(Tensor4D) = alpha * Input(Tensor4D) * Filter(Tensor4D) + beta * Input(Tensor4D) + conv3d Conv3d operation. Output(Tensor5D) = alpha * Input(Tensor5D) * Filter(Tensor5D) + beta * Input(Tensor5D) For more details about a particular operation, specify the operation name with --help. 
 Example:
 
-  $ ./tools/profiler/cutlass_profiler --operation=Gemm --help
+  $ cutlass_profiler --operation=Gemm --help
+
+  $ cutlass_profiler --operation=Conv3d --help
+
+  $ cutlass_profiler --operation=Conv2d --help
+
+  $ cutlass_profiler --operation=SparseGemm --help
 ```
 
+# GEMM
+
+The CUTLASS Profiler is capable of executing each GEMM kernel.
+
 ## GEMM Arguments
 
 The complete set of arguments available to each operation may be viewed by specifying the operation name
@@ -189,7 +202,7 @@ Test your changes to gemm kernels with a quick functional test and save results
     --providers=cutlass --output=functional-test.csv
 ```
 
-## Example SGEMM
+## Example CUDA Core GEMM Operation (SGEMM)
 
 Example command line for profiling SGEMM kernels is as follows:
 ```bash
@@ -226,7 +239,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096
 
 Note, the arguments which appear in the output may be used as command line parameters for subsequent invocations.
 
-## Example Tensor Core Operations
+## Example Tensor Core GEMM Operations (S16816GEMM)
 
 To execute kernels targeting Tensor Core operations, supply the flag `--op_class=tensorop` in the command line.
 
@@ -293,6 +306,158 @@ $ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sgemm_128x128_nn
   --tags=cutlass:2.2,date:2020-06-08
 ```
 
+# Convolution
+
+The CUTLASS Profiler is capable of executing 2-D and 3-D convolution problems for the forward and backward
+operator variants.
+
+The CUTLASS Profiler can be built with cuDNN enabled, using it as a reference implementation. If CMake detects
+the cuDNN library available in the system, it is included as a dependency. This may be explicitly overridden
+with the CMake flag `CUTLASS_ENABLE_CUDNN`.
+
+```bash
+$ cmake .. -DCUTLASS_LIBRARY_OPERATIONS=conv2d -DCUTLASS_ENABLE_CUDNN=OFF
+...
+$ make -j16 cutlass_profiler +``` + + +## Convolution Arguments + +```bash +$ ./tools/profiler/cutlass_profiler --help --operation=Conv2d + +Conv2d + + [enum] --conv_kind Convolutional operator (fprop, dgrad, wgrad) + [int] --n,--input_n Input N dimension of the Conv2d problem space + [int] --h,--input_h Input H dimension of the Conv2d problem space + [int] --w,--input_w Input W dimension of the Conv2d problem space + [int] --c,--input_c Input C dimension of the Conv2d problem space + [int] --k,--filter_k Filter K dimension of the Conv2d problem space + [int] --r,--filter_r Filter R dimension of the Conv2d problem space + [int] --s,--filter_s Filter S dimension of the Conv2d problem space + [int] --p,--output_p Output P dimension of the Conv2d problem space + [int] --q,--output_q Output Q dimension of the Conv2d problem space + [int] --pad_h Padding in H direction + [int] --pad_w Padding in W direction + [int] --stride_h Stride in H direction + [int] --stride_w Stride in W direction + [int] --dilation_h Dilation in H direction + [int] --dilation_w Dilation in W direction + [tensor] --Activation Tensor storing the Activation operand + [tensor] --Filter Tensor storing the Filter operand + [tensor] --Output Tensor storing the Output operand + [enum] --conv_mode Convolution filter mode (conv, cross) + [enum] --iterator_algorithm,--iterator_algo Convolution iterator algorithm (analytic, optimized) + [scalar] --alpha,--epilogue::alpha Epilogue scalar alpha + [scalar] --beta,--epilogue::beta Epilogue scalar beta + [enum] --split_k_mode,--split-k-mode SplitK mode for serial or parallel reduction (serial, parallel) + [int] --split_k_slices,--split-k-slices Number of partitions of K dimension + [enum] --eq_gemm_provider,--eq-gemm-provider Enable profiling equivalent gemm by the following providers (cutlass) + [enum] --op_class,--opcode-class Class of math instruction (simt, tensorop, wmmatensorop, wmma) + [enum] --accum,--accumulator-type Math instruction accumulator data type + [int] --cta_m,--threadblock-shape::m Threadblock shape in the M dimension + [int] --cta_n,--threadblock-shape::n Threadblock shape in the N dimension + [int] --cta_k,--threadblock-shape::k Threadblock shape in the K dimension + [int] --stages,--threadblock-stages Number of stages of threadblock-scoped matrix multiply + [int] --warps_m,--warp-count::m Number of warps within threadblock along the M dimension + [int] --warps_n,--warp-count::n Number of warps within threadblock along the N dimension + [int] --warps_k,--warp-count::k Number of warps within threadblock along the K dimension + [int] --inst_m,--instruction-shape::m Math instruction shape in the M dimension + [int] --inst_n,--instruction-shape::n Math instruction shape in the N dimension + [int] --inst_k,--instruction-shape::k Math instruction shape in the K dimension + [int] --min_cc,--minimum-compute-capability Minimum device compute capability + [int] --max_cc,--maximum-compute-capability Maximum device compute capability + +Examples: + +Profile a particular convolution (specify all the convolution parameters): + + $ cutlass_profiler --operation=Conv2d --Activation=f16:nhwc \ + --Filter=f16:nhwc --Output=f16 --accumulator-type=f32 \ + --n=32 --h=14 --w=14 --c=8 --k=64 --r=3 --s=3 \ + --pad_h=1 --pad_w=1 \ + --stride::h=1 --stride::w=1 --dilation::h=1 --dilation::w=1 + +``` + +## Example CUDA Core Convolution Operation (SFPROP) + +Example command line for profiling Convolution kernels is as follows: + +```bash +$ ./tools/profiler/cutlass_profiler 
--kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc --verification-providers=device --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 + + +============================= + Problem ID: 1 + + Provider: CUTLASS + OperationKind: conv2d + Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc + + Status: Success + Verification: ON + Disposition: Passed + +reference_device: Passed + + Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ + --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc \ + --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ + --eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 + + Bytes: 2055798784 bytes + FLOPs: 118482796544 flops + + Runtime: 8.13237 ms + Memory: 235.431 GiB/s + + Math: 14569.3 GFLOP/s + +``` + +## Example Tensor Core Convolution Operation (S16816FPROP) + +Example command line for profiling Convolution kernels is as follows: + +```bash +$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s16816fprop_optimized_f16_128x128_64x4_nhwc --verification-providers=device --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 + + + +============================= + Problem ID: 1 + + Provider: CUTLASS + OperationKind: conv2d + Operation: cutlass_tensorop_s16816fprop_optimized_f16_128x128_64x4_nhwc + + Status: Success + Verification: ON + Disposition: Passed + +reference_device: Passed + + Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ + --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f16:nhwc --Filter=f16:nhwc --Output=f32:nhwc \ + --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ + --eq_gemm_provider=none --op_class=tensorop --accum=f32 --cta_m=128 --cta_n=128 --cta_k=64 --stages=4 \ + --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=16 --min_cc=80 --max_cc=1024 + + Bytes: 1130659840 bytes + FLOPs: 118482796544 flops + + Runtime: 0.945071 ms + Memory: 1114.21 GiB/s + + Math: 125369 GFLOP/s + + +``` + # Copyright Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md index 427fe13c66..425d927003 100644 --- a/media/docs/quickstart.md +++ b/media/docs/quickstart.md @@ -7,11 +7,15 @@ ## Prerequisites CUTLASS requires: -- NVIDIA CUDA Toolkit (9.2 or later required, [11.0](https://developer.nvidia.com/cuda-toolkit) recommended) +- NVIDIA CUDA Toolkit (9.2 or later required, [11.1](https://developer.nvidia.com/cuda-toolkit) recommended) - CMake 3.12+ - host compiler supporting C++11 or greater (g++ 7.3.0 or Microsoft Visual Studio 2015 recommended) - Python 3.6+ +CUTLASS may be optionally compiled and linked with +- cuBLAS +- cuDNN v7.6 or later + ## Initial build steps Construct a build directory and run CMake. @@ -31,6 +35,23 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BU This reduces overall compilation time by excluding unit tests and enabling the unit build. +You may reduce build times by compiling only certain operations by setting the `CUTLASS_LIBRARY_OPERATIONS` flag as shown below, +executed from an empty `build/` directory. 
This only compiles 2-D convolution kernels.
+
+```bash
+$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_OPERATIONS=conv2d
+```
+
+You may also filter kernels by name by supplying a filter string with flag `CUTLASS_LIBRARY_KERNELS`.
+
+```bash
+$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_KERNELS=s16816gemm,s16816fprop*128x128
+```
+
+You may explicitly exclude cuBLAS and cuDNN as dependencies with the following CMake flags.
+- `-DCUTLASS_ENABLE_CUBLAS=OFF`
+- `-DCUTLASS_ENABLE_CUDNN=OFF`
+
 ## Build and run the CUTLASS Profiler
 
@@ -39,7 +60,7 @@ From the `build/` directory created above, compile the the CUTLASS Profiler.
 $ make cutlass_profiler -j12
 ```
 
-Then execute the CUTLASS Profiler for a set of problem sizes.
+To execute the CUTLASS Profiler for GEMM, run the following command.
 
 ```bash
 $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=4352 --n=4096 --k=4096
@@ -66,6 +87,45 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=4352 --n=4096 --k=4096
 
   Math: 13854.9 GFLOP/s
 ```
 
+To execute the CUTLASS Profiler for Convolution, run the following example.
+```bash
+$ ./tools/profiler/cutlass_profiler --kernels=s1688fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --pad_h=1 --pad_w=1
+```
+
+To execute all CUTLASS 2-D convolution operators, execute the following.
+```bash
+$ ./tools/profiler/cutlass_profiler --operation=conv2d --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3
+
+
+=============================
+  Problem ID: 1
+
+        Provider: CUTLASS
+   OperationKind: conv2d
+       Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc
+
+          Status: Success
+    Verification: ON
+     Disposition: Passed
+
+reference_device: Passed
+
+       Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \
+                  --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc \
+                  --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \
+                  --eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \
+                  --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024
+
+           Bytes: 2055798784  bytes
+           FLOPs: 118482796544  flops
+
+         Runtime: 8.13237  ms
+          Memory: 235.431 GiB/s
+
+            Math: 14569.3 GFLOP/s
+
+```
+
 See [documentation for the CUTLASS Profiler](profiler.md) for more details.
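+
+As a further example composed from the conv2d arguments documented in the
+[CUTLASS Profiler documentation](profiler.md) (shown here as an illustration rather than captured output),
+the backward data gradient variants may be selected with `--conv_kind`:
+
+```bash
+$ ./tools/profiler/cutlass_profiler --operation=conv2d --conv_kind=dgrad --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --pad_h=1 --pad_w=1
+```
+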
## Build and run CUTLASS Unit Tests diff --git a/media/images/conv2d-fprop-int4.png b/media/images/conv2d-fprop-int4.png new file mode 100644 index 0000000000..375c0d752f Binary files /dev/null and b/media/images/conv2d-fprop-int4.png differ diff --git a/media/images/ldmatrix-8x128bx4.png b/media/images/ldmatrix-8x128bx4.png new file mode 100644 index 0000000000..44d50d9ae8 Binary files /dev/null and b/media/images/ldmatrix-8x128bx4.png differ diff --git a/media/images/ldmatrix-tensorop-32x32x32.png b/media/images/ldmatrix-tensorop-32x32x32.png new file mode 100644 index 0000000000..7acc9723f9 Binary files /dev/null and b/media/images/ldmatrix-tensorop-32x32x32.png differ diff --git a/media/images/mma-8x8x32.png b/media/images/mma-8x8x32.png new file mode 100644 index 0000000000..ff65d83222 Binary files /dev/null and b/media/images/mma-8x8x32.png differ diff --git a/media/images/tensor-op-permuted-smem-layout-TN-k0.png b/media/images/tensor-op-permuted-smem-layout-TN-k0.png new file mode 100644 index 0000000000..b9ab8cb313 Binary files /dev/null and b/media/images/tensor-op-permuted-smem-layout-TN-k0.png differ diff --git a/media/images/tensor-op-permuted-smem-layout-TN-k1.png b/media/images/tensor-op-permuted-smem-layout-TN-k1.png new file mode 100644 index 0000000000..ea7d8b3be9 Binary files /dev/null and b/media/images/tensor-op-permuted-smem-layout-TN-k1.png differ diff --git a/media/images/tensor-op-permuted-smem-layout-TN.png b/media/images/tensor-op-permuted-smem-layout-TN.png new file mode 100644 index 0000000000..5bb4fe47b3 Binary files /dev/null and b/media/images/tensor-op-permuted-smem-layout-TN.png differ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 35994ba6d8..436990fd66 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -21,3 +21,4 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add_subdirectory(unit) + diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 52368a346a..d57570ce6c 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -49,19 +49,14 @@ target_link_libraries( cutlass_test_unit_infra ) -set(CUTLASS_INSTALL_TESTS ON CACHE BOOL "Install test executables") -set(CUTLASS_TEST_EXECUTION_ENVIRONMENT "" CACHE BOOL "Environment in which to invoke unit test executables") - -function(cutlass_test_unit_add_executable) +function(cutlass_test_unit_add_executable NAME) set(options) set(oneValueArgs) set(multiValueArgs) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cutlass_add_executable(${__UNPARSED_ARGUMENTS}) - - list(GET __UNPARSED_ARGUMENTS 0 NAME) + cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS}) target_link_libraries( ${NAME} @@ -72,25 +67,13 @@ function(cutlass_test_unit_add_executable) string(REGEX REPLACE cutlass_ "" NAME_STEM ${NAME}) - add_test(c${NAME_STEM} ${NAME}) + set(CUTLASS_TEST_UNIT_TEST_COMMAND_OPTIONS --gtest_output=xml:${NAME_STEM}.gtest.xml) - add_custom_target( - ${NAME_STEM} - COMMAND - ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ - DEPENDS - ${NAME} + cutlass_add_executable_tests( + ${NAME_STEM} ${NAME} + TEST_COMMAND_OPTIONS CUTLASS_TEST_UNIT_TEST_COMMAND_OPTIONS ) - if (CUTLASS_INSTALL_TESTS) - - install( - TARGETS ${NAME} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ) - - endif() - endfunction() add_custom_target(cutlass_test_unit) @@ -99,6 +82,7 @@ add_custom_target(test_unit) set(SUBDIRS core gemm + conv layout transform epilogue diff --git a/test/unit/conv/CMakeLists.txt b/test/unit/conv/CMakeLists.txt new file mode 100644 index 0000000000..a50a58f59e --- /dev/null +++ b/test/unit/conv/CMakeLists.txt @@ -0,0 +1,42 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +add_custom_target(cutlass_test_unit_conv) +add_custom_target(test_unit_conv) + +set(CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED ON CACHE BOOL + "Enable/Disable convolution device reference for conv unit tests.") + +if(CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED) + message(STATUS "Enable device reference verification in conv unit tests") + list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED=1) +endif() + +foreach(SUBDIR + device + ) + + add_subdirectory(${SUBDIR}) + add_dependencies(cutlass_test_unit_conv cutlass_test_unit_conv_${SUBDIR}) + add_dependencies(test_unit_conv test_unit_conv_${SUBDIR}) + +endforeach() diff --git a/test/unit/conv/device/CMakeLists.txt b/test/unit/conv/device/CMakeLists.txt new file mode 100644 index 0000000000..ce907e0d58 --- /dev/null +++ b/test/unit/conv/device/CMakeLists.txt @@ -0,0 +1,148 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + add_custom_target( + cutlass_test_unit_conv_device + DEPENDS + cutlass_test_unit_conv_device_simt + cutlass_test_unit_conv_device_tensorop_f32_sm70 + cutlass_test_unit_conv_device_tensorop_f32_sm75 + cutlass_test_unit_conv_device_tensorop_f16_sm80 + cutlass_test_unit_conv_device_tensorop_f32_sm80 + cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 + cutlass_test_unit_conv_device_tensorop_s32 + cutlass_test_unit_conv_device_tensorop_s32_interleaved +) + + add_custom_target( + test_unit_conv_device + DEPENDS + test_unit_conv_device_simt + test_unit_conv_device_tensorop_f32_sm70 + test_unit_conv_device_tensorop_f32_sm75 + test_unit_conv_device_tensorop_f16_sm80 + test_unit_conv_device_tensorop_f32_sm80 + test_unit_conv_device_tensorop_f32_tf32_sm80 + test_unit_conv_device_tensorop_s32 + test_unit_conv_device_tensorop_s32_interleaved +) + +# +# OpClassSimt (CUDA cores) +# + +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_simt + + # F32 + conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu + + conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + + # CF32 + conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu + conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu + conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu + + conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu +) + +# +# OpClassTensorOp (Tensor cores) +# + +# Conv - F16 input, F32 output, F32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_sm70 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu +) + +# Conv2d - F16 input, F32 output, F32 accumulation - SM75 +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_sm75 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu +) + +# Conv2d - F16 input, F16 output, F16 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f16_sm80 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu +) + +# Conv2d - F16 input, F32 output, F32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_sm80 + + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + + conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu + conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +) + +# Conv2d - TF32 input, F32 output, F32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 + + 
conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + + conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +) + +# Conv2d - S8 input, S32 output, S32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_s32 + + conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu + conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu +) + +# Conv2d - S8 interleaved input, S8 interleaved output, S32 accumulation +cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_s32_interleaved + + conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu + conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu +) diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu new file mode 100644 index 0000000000..4d500d9783 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -0,0 +1,130 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file
+  \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_dgrad.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM50_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
+  64x64_8x2_32x64x8) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = cutlass::complex<float>;
+  using ElementB = cutlass::complex<float>;
+  using ElementC = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+  using ElementCompute = cutlass::complex<float>;
+
+  /// Device-level Conv2d instance
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA,
+    cutlass::layout::TensorNHWC,
+    ElementB,
+    cutlass::layout::TensorNHWC,
+    ElementC,
+    cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm50,
+    cutlass::gemm::GemmShape<64, 64, 8>,
+    cutlass::gemm::GemmShape<32, 32, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::conv::IteratorAlgorithm::kAnalytic
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM50_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32,
+  32x64_8x2_32x64x8) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = cutlass::complex<float>;
+  using ElementB = cutlass::complex<float>;
+  using ElementC = cutlass::complex<float>;
+  using ElementAccumulator = cutlass::complex<float>;
+  using ElementCompute = cutlass::complex<float>;
+
+  /// Device-level Conv2d instance
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA,
+    cutlass::layout::TensorNHWC,
+    ElementB,
+    cutlass::layout::TensorNHWC,
+    ElementC,
+    cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm50,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAddComplex,
+    cutlass::conv::IteratorAlgorithm::kOptimized,
+    cutlass::conv::StrideSupport::kUnity
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
new file mode 100644
index 0000000000..cc36edc75e
--- /dev/null
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@@
-0,0 +1,314 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using 
ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + 
>::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu new file mode 100644 index 0000000000..aab0d34e49 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -0,0 +1,123 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + 
cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu new file mode 100644 index 0000000000..bc9ee6e9d7 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -0,0 +1,118 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file
+  \brief Tests for device-wide Implicit GEMM interface
+*/
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/conv/kernel/default_conv2d_dgrad.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+
+#include "conv2d_testbed.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM70_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32,
+  128x128_32x2_64x64x32) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = cutlass::half_t;
+  using ElementB = cutlass::half_t;
+  using ElementC = float;
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+
+  /// Device-level Conv2d instance
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm70,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAdd
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM70_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride,
+  128x128_32x2_64x64x32) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = cutlass::half_t;
+  using ElementB = cutlass::half_t;
+  using ElementC = float;
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+
+  /// Device-level Conv2d instance
+  using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad<
+    ElementA, cutlass::layout::TensorNHWC,
+    ElementB, cutlass::layout::TensorNHWC,
+    ElementC, cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm70,
+    cutlass::gemm::GemmShape<128, 128, 32>,
+    cutlass::gemm::GemmShape<64, 64, 32>,
+    cutlass::gemm::GemmShape<8, 8, 4>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      128 / cutlass::sizeof_bits<ElementC>::value,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    2,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized,
+    cutlass::conv::StrideSupport::kUnity
+  >::Kernel;
+
+  using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>;
+
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+#endif  // CUTLASS_ARCH_MMA_SM70_SUPPORTED
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
new file mode 100644
index 0000000000..7417f92197
--- /dev/null
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@@ -0,0 +1,159 @@
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM75_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// 
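// A minimal sketch of how a single problem size can be pushed through a device-level operator
// such as the Conv2dDgrad types defined in the tests in this file, assuming tensors for the
// output gradient, filter, and input gradient have already been allocated and filled on the
// device. The helper name run_conv2d_dgrad_sketch and its HostTensor-style parameters are
// illustrative assumptions, not part of the test harness; TestAllConv2d performs equivalent
// steps internally for a library of problem sizes and verifies the result against a host reference.
template <typename Conv2dDgrad, typename TensorDy, typename TensorW, typename TensorDx>
cutlass::Status run_conv2d_dgrad_sketch(TensorDy &tensor_dy, TensorW &tensor_w, TensorDx &tensor_dx) {

  // One NHWC problem: N=1, H=W=56, C=64 activations; K=64 filters of size 3x3; pad 1, unit stride.
  cutlass::conv::Conv2dProblemSize problem_size(
    {1, 56, 56, 64},    // input size  (NHWC)
    {64, 3, 3, 64},     // filter size (KRSC)
    {1, 1, 1, 1},       // padding     (pad_h, _, pad_w, _)
    {1, 1},             // stride      (stride_h, stride_w)
    {1, 1}              // dilation    (dilation_h, dilation_w)
  );

  // For dgrad, operand A is the gradient w.r.t. the output, operand B is the filter, and the
  // C/D tensors hold the gradient w.r.t. the input; alpha = 1, beta = 0 in the linear-combination
  // epilogue (ElementCompute is float for the kernels in this file).
  typename Conv2dDgrad::Arguments arguments(
    problem_size,
    tensor_dy.device_ref(),
    tensor_w.device_ref(),
    tensor_dx.device_ref(),
    tensor_dx.device_ref(),
    {1.0f, 0.0f}
  );

  Conv2dDgrad conv_op;

  // Reject problem/kernel combinations this instance cannot implement (e.g. stride or alignment).
  cutlass::Status status = conv_op.can_implement(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  // No split-K reduction here, so no extra workspace is needed.
  status = conv_op.initialize(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  // Launch the implicit GEMM dgrad kernel on the default CUDA stream.
  return conv_op();
}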
+TEST(SM75_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM75_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..01f51a2cc4 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,286 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic, + cutlass::conv::StrideSupport::kStrided + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 
128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + 
cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride, + 128x128_64x4_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // 
CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..7682a319fe --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -0,0 +1,323 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + 
cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + test::conv::device::Conv2dProblemVector user_size; + + user_size.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, 4}, // input size (NHWC) + {8, 1, 1, 4}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d(user_size)); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + 
cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..48c6ddb043 --- /dev/null +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,124 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dDgradKernel = typename cutlass::conv::kernel::DefaultConv2dDgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + 
cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv2dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv2dDgradKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dDgrad>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu new file mode 100644 index 0000000000..b3b66a9de1 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -0,0 +1,222 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x2_32x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x128_8x2_16x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 128, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x2_32x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// 
Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x2_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + + diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..25e3ee0d5f --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -0,0 +1,397 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + 
cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; 
+ using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x5_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 5, 
+ cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); +} +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x3_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex<float>; + using ElementB = cutlass::complex<float>; + using ElementC = cutlass::complex<float>; + using ElementAccumulator = cutlass::complex<float>; + using ElementCompute = cutlass::complex<float>; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu new file mode 100644 index 0000000000..e151f5a78f --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -0,0 +1,121 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + 
cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..4c8102a503 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,124 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*!
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu new file mode 100644 index 0000000000..15f5585839 --- /dev/null +++ 
b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -0,0 +1,81 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) + +TEST(SM70_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM70_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu new file mode 100644 index 0000000000..b54359f177 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -0,0 +1,121 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test 
sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..51d2b942f4 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,124 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#if 0 +TEST(SM80_Device_Conv2d_Fprop_Precomputed_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu new file mode 100644 index 0000000000..820f0fb89f --- /dev/null +++ 
b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu @@ -0,0 +1,82 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x2_64x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..746e7d7b0b --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -0,0 +1,321 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + 
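+ // Note: TestAllConv2d (brought in via conv2d_testbed.h) is expected to sweep a set of representative Conv2dProblemSize configurations through the device-level convolution defined above, appending any user-supplied problem sizes, and to verify each result against a reference implementation.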
EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + test::conv::device::Conv2dProblemVector user_size; + + user_size.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, 4}, // input size (NHWC) + {8, 1, 1, 4}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>(user_size)); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA,
+    cutlass::layout::TensorNHWC,
+    ElementB,
+    cutlass::layout::TensorNHWC,
+    ElementC,
+    cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32,
+  128x128_8x4_64x32x8) {
+
+  /// Conv operation element types for the Gemm equivalent (ImplicitGemm)
+  using ElementA = float;
+  using ElementB = float;
+  using ElementC = float;
+  using ElementAccumulator = float;
+  using ElementCompute = float;
+
+
+  /// Device-level Conv2d instance
+  using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
+    ElementA,
+    cutlass::layout::TensorNHWC,
+    ElementB,
+    cutlass::layout::TensorNHWC,
+    ElementC,
+    cutlass::layout::TensorNHWC,
+    ElementAccumulator,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 8>,
+    cutlass::gemm::GemmShape<64, 32, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      ElementC,
+      1,
+      ElementAccumulator,
+      ElementCompute
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4,
+    cutlass::arch::OpMultiplyAdd,
+    cutlass::conv::IteratorAlgorithm::kOptimized
+  >::Kernel;
+
+  using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
+
+  /// Run all unit test sizes with device-level Conv2d instance
+  EXPECT_TRUE(test::conv::device::TestAllConv2d<Conv2dFprop>());
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
new file mode 100644
index 0000000000..7255eac644
--- /dev/null
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
@@ -0,0 +1,520 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright notice, this list of
+ * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of
+ * conditions and the following disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x128_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x128_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + 
cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x256_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x64_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x256_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x128_128x2_32x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x128_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + 
cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x128_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x256_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x64_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x256_128x2_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x128_128x2_32x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 128>, + 
cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu new file mode 100644 index 0000000000..7e9bb9060b --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu @@ -0,0 +1,521 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x128_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x128_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x256_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = 
cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x64_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x256_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + 
cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x128_128x4_32x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x128_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x128_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = 
int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 128x256_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 256x64_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + 
cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x256_128x3_64x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32, + 64x128_128x4_32x64x128) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<64>, + ElementB, cutlass::layout::TensorCxRSKx<64>, + ElementC, cutlass::layout::TensorNCxHWx<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu new file mode 100644 index 0000000000..5426003779 --- /dev/null +++ 
b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu @@ -0,0 +1,119 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<8, 8, 32>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu new file mode 100644 index 0000000000..d0ba7a5047 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu @@ -0,0 +1,121 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + 
+TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::int4b_t; + using ElementB = cutlass::int4b_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu new file mode 100644 index 0000000000..fbab373165 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu @@ -0,0 +1,679 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x256_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using 
ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x64_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x256_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = 
cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x64_64x2_64x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x128_64x2_32x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x64_64x2_32x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, 
cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + 
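
The tests in this file only instantiate types; the host-side launch sequence is hidden inside `TestAllInterleavedConv2d`. The sketch below shows how such a `Conv2dFprop` specialization is typically driven through the CUTLASS 2.x device-level API (`Arguments`, `can_implement`, `initialize`, `operator()`). The helper name `run_fprop_once`, the tensor names, and the alpha/beta values are illustrative assumptions, not part of this diff.

```cpp
// Minimal host-side driver sketch (not part of this diff). It assumes the CUTLASS 2.x
// device-level API of cutlass::conv::device::ImplicitGemmConvolution and the utility
// headers under tools/util; the helper name and problem dimensions are hypothetical.
#include "cutlass/cutlass.h"
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/device_memory.h"

template <typename ImplicitGemm>
cutlass::Status run_fprop_once(cutlass::conv::Conv2dProblemSize const &problem_size) {

  using ElementA = typename ImplicitGemm::ElementA;
  using ElementB = typename ImplicitGemm::ElementB;
  using ElementC = typename ImplicitGemm::ElementC;
  using LayoutA  = typename ImplicitGemm::LayoutA;
  using LayoutB  = typename ImplicitGemm::LayoutB;
  using LayoutC  = typename ImplicitGemm::LayoutC;

  // Activation (N,H,W,C), filter (K,R,S,C), and output (N,P,Q,K) tensors.
  // Filling them with data and copying host -> device is omitted in this sketch.
  cutlass::HostTensor<ElementA, LayoutA> tensor_a(
      {problem_size.N, problem_size.H, problem_size.W, problem_size.C});
  cutlass::HostTensor<ElementB, LayoutB> tensor_b(
      {problem_size.K, problem_size.R, problem_size.S, problem_size.C});
  cutlass::HostTensor<ElementC, LayoutC> tensor_c(
      {problem_size.N, problem_size.P, problem_size.Q, problem_size.K});
  cutlass::HostTensor<ElementC, LayoutC> tensor_d(
      {problem_size.N, problem_size.P, problem_size.Q, problem_size.K});

  // ElementCompute is float in every kernel instantiated in this file
  float alpha = 1.0f;
  float beta  = 0.0f;

  typename ImplicitGemm::Arguments arguments{
      problem_size,
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      tensor_c.device_ref(),
      tensor_d.device_ref(),
      {alpha, beta}};

  ImplicitGemm conv_op;

  // Query and allocate any workspace the kernel needs, then check, initialize, and launch
  size_t workspace_size = conv_op.get_workspace_size(arguments);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  cutlass::Status status = conv_op.can_implement(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  status = conv_op.initialize(arguments, workspace.get());
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  return conv_op();   // enqueues the implicit GEMM convolution kernel
}
```

A call such as `run_fprop_once<Conv2dFprop>(problem_size)` performs a single forward convolution; the `TestAllInterleavedConv2d` helper repeats equivalent steps over its built-in list of problem sizes.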
+//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x256_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x64_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x256_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 256, 64>, + 
cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x64_64x2_64x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x128_64x2_32x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 
64x64_64x2_32x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu new file mode 100644 index 0000000000..e8b7c44fe2 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu @@ -0,0 +1,680 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x256_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using 
Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x64_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x256_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + 
/// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x64_64x4_64x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x128_64x4_32x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x64_64x6_32x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + 
ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 6, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + 
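
For orientation, the test names encode the tile configuration of the GEMM that Fprop is mapped onto: `128x128_64x3_64x64x64` corresponds to the `GemmShape<128, 128, 64>` threadblock tile with 3 pipeline stages and the `GemmShape<64, 64, 64>` warp tile used above. The function below spells out that implicit-GEMM extent for Fprop; it is only an illustrative restatement of the arithmetic that CUTLASS exposes as `cutlass::conv::implicit_gemm_problem_size`.

```cpp
// Standalone sketch (not part of this diff) of the GEMM extent that Conv2d Fprop is
// mapped onto; CUTLASS ships an equivalent helper, cutlass::conv::implicit_gemm_problem_size.
#include "cutlass/gemm/gemm.h"
#include "cutlass/conv/conv2d_problem_size.h"

inline cutlass::gemm::GemmCoord fprop_gemm_extent(
    cutlass::conv::Conv2dProblemSize const &problem) {

  // GEMM M: one row per output activation (N x P x Q)
  int gemm_m = problem.N * problem.P * problem.Q;

  // GEMM N: one column per output channel (filter count K)
  int gemm_n = problem.K;

  // GEMM K: reduction over the filter footprint and input channels (R x S x C)
  int gemm_k = problem.R * problem.S * problem.C;

  return cutlass::gemm::GemmCoord(gemm_m, gemm_n, gemm_k);
}
```

The threadblock swizzle then covers the resulting M x N extent with (for example) 128x128 tiles while the mainloop steps through K in 64-element chunks across the configured number of stages.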
+//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x256_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 256x64_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x256_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + 
cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 128x64_64x4_64x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 64x128_64x4_32x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 4, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32, + 
64x64_64x6_32x32x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 6, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE((test::conv::device::TestAllInterleavedConv2d())); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu new file mode 100644 index 0000000000..e5146be328 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu @@ -0,0 +1,119 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, + 128x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, + 128x128_64x2_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<8, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu new file mode 100644 index 0000000000..4cfdd3722d --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu @@ -0,0 +1,120 @@ +/*************************************************************************************************** + * Copyright 
(c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, + 128x128_64x3_64x64x64) { + + /// Conv operation 
element types for the Gemm equivalent (ImplicitGemm) + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..c1a1f647a3 --- /dev/null +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,81 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_problems.h b/test/unit/conv/device/conv2d_problems.h new file mode 100644 index 0000000000..74b43e11c7 --- /dev/null +++ b/test/unit/conv/device/conv2d_problems.h @@ -0,0 +1,520 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed sizes for Conv2d problem +*/ +#pragma once + +#include + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#define CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 1 + +namespace test { +namespace conv { +namespace device { + +using Conv2dProblemVector = std::vector; + +// +// Structures to prune items from Conv2dProblemVector +// +// Specification template for pruning items for convolution problem lists +template struct Specification +{ + virtual ~Specification() = default; + virtual bool is_satisfied(T item) const = 0; +}; + +// input size (NHWC) specification +struct InputSizeSpecification : Specification +{ + cutlass::Tensor4DCoord input_size; + + InputSizeSpecification(cutlass::Tensor4DCoord input_size_) : input_size(input_size_) {} + + bool is_satisfied(cutlass::conv::Conv2dProblemSize item) const override { + return ((input_size.n() == item.N) && (input_size.h() == item.H) && (input_size.w() == item.W) && (input_size.c() == item.C)); + } +}; + +// stride (stride_h, stride_w) specification +struct StrideSpecification : Specification +{ + cutlass::MatrixCoord stride; + + StrideSpecification(cutlass::MatrixCoord stride_) : stride(stride_) {} + + bool is_satisfied(cutlass::conv::Conv2dProblemSize item) const override { + return ((stride.row() == item.stride_h) && (stride.column() == item.stride_h)); + } +}; + +// channel (C,K) specification, must be multiple of minimum channel +struct ChannelDivisibilitySpecification : Specification +{ + int channel_multiple; + + ChannelDivisibilitySpecification(int channel_multiple_) : channel_multiple(channel_multiple_) {} + + bool is_satisfied(cutlass::conv::Conv2dProblemSize item) const override { + return ((item.K % channel_multiple == 0) && (item.C % channel_multiple == 0)); + } +}; + +// +// Pruning function for items from Conv2dProblemVector based on a Specification +// +inline Conv2dProblemVector prune(Conv2dProblemVector const &items, + Specification const &spec) +{ + Conv2dProblemVector pruned_list; + + for (auto& p : items) + if (spec.is_satisfied(p)) + pruned_list.push_back(p); + return pruned_list; +} + + +//////////////////////////////////////////////////////////////////////////// +/// Structure TestbedConv2dProblemSizes initializes and holds conv default and +/// important network sizes +//////////////////////////////////////////////////////////////////////////// +struct TestbedConv2dProblemSizes { + + // + // Data members + // + int minimum_channel_size; + + Conv2dProblemVector conv2d_default_sizes; + Conv2dProblemVector conv2d_rigorous_sizes; + Conv2dProblemVector conv2d_resnet50_sizes; + Conv2dProblemVector conv2d_resnet50_sizes_perf; + + // + // Methods + // + /// Default ctor + TestbedConv2dProblemSizes(int minimum_channel_size_ = 64): 
minimum_channel_size (minimum_channel_size_) { + initialize_conv2d_default_sizes(); + initialize_conv2d_rigorous_sizes(); + initialize_conv2d_resnet50_sizes(conv2d_resnet50_sizes, 1 /*batch-size*/); + + initialize_conv2d_resnet50_sizes(conv2d_resnet50_sizes_perf, 34 /*batch-size*/); + filter_all(); + } + + /// Eliminates some illegal cases + void filter_all() { + + Conv2dProblemVector *problems_vectors[] = { + &conv2d_default_sizes, + &conv2d_rigorous_sizes, + &conv2d_resnet50_sizes, + &conv2d_resnet50_sizes_perf + }; + + for (Conv2dProblemVector *problems : problems_vectors) { + Conv2dProblemVector filtered; + + for (cutlass::conv::Conv2dProblemSize const & problem : *problems) { + if (!(problem.C % minimum_channel_size)) { + filtered.push_back(problem); + } + } + + *problems = filtered; + } + } + + // Add a few standard convolution problem sizes + void initialize_conv2d_default_sizes() { + + //////////////////////////////////////////////////////////////////////////////////////////// + // Very Small input size (1x8x8xminimum_channel_size), filter size (3x3 - 7x7), stride (1,1) + // C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64} + //////////////////////////////////////////////////////////////////////////////////////////// + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 3, 3, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 4, 4, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 5, 5, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 6, 5, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 6, 6, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, minimum_channel_size}, // input size (NHWC) + {8, 7, 7, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + //////////////////////////////////////////////////////////////////////////////////// + // Medium input size (1x16x16x128), filter size (1x1, 2x2, 3x3, 5x5), stride (1, 1) + //////////////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 15, 19, 160}, // 
input size (NHWC) + {224, 1, 1, 160}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 16, 16, 160}, // input size (NHWC) + {224, 2, 3, 160}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 23, 21, 128}, // input size (NHWC) + {224, 3, 3, 128}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 29, 37, 160}, // input size (NHWC) + {224, 5, 5, 160}, // filter size (KRSC) + {2, 2, 2, 2}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + //////////////////////////////////////////////////////////////////////////////////// + // C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64} + //////////////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 15, 19, 32 + minimum_channel_size}, // input size (NHWC) + {96, 3, 3, 32 + minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 16, 16, 64 + minimum_channel_size}, // input size (NHWC) + {96, 3, 3, 64 + minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + //////////////////////////////////////////////////////////////////////////////////// + // Medium input size (1x16x16x128), filter size (1x1, 3,x3, 5x5), stride (2, 2) + //////////////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 19, 37, 160}, // input size (NHWC) + {224, 3, 3, 160}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 16, 16, 288}, // input size (NHWC) + {160, 5, 5, 288}, // filter size (KRSC) + {2, 2, 2, 2}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + ///////////////////////////////////////////////////////////////////////////// + // Additional input size + ///////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {3, 28, 28, 256}, // input size (NHWC) + {256, 2, 2, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {32, 32, 32, 32}, // input size (NHWC) + {32, 1, 1, 32}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, 
stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {4, 3, 3, 128}, // input size (NHWC) + {256, 3, 3, 128}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {4, 3, 3, 256} // output size (NPQK) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {4, 1, 1, 256}, // input size (NHWC) + {328, 3, 3, 256}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {4, 1, 1, 328} // output size (NPQK) + )); + + } + + + // Add a few large and rigorous convolution problem sizes + void initialize_conv2d_rigorous_sizes() { + +#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED + conv2d_rigorous_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 124, 224, 96}, // input size (NHWC) + {24, 7, 7, 96}, // filter size (KRSC) + {1, 229, 129, 32} // output size (NPQK) + )); + + conv2d_rigorous_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 233, 35, 48}, // input size (NHWC) + {24, 7, 5, 48}, // filter size (KRSC) + {1, 233, 35, 24} // output size (NPQK) + )); + +#endif + + } + + + // Add resent50 layers to unit testing sizes + void initialize_conv2d_resnet50_sizes(Conv2dProblemVector &conv2d_problem_vector, int batch_size = 1){ + +#if 0 // Resnet50 first layer (layer_id = 0) with channel = 3 is not supported in cutlass + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + [1, 224, 224, 3], // input size (NHWC) + [64, 7, 7, 3], // filter size (KRSC) + [3, 3, 3, 3], // padding (pad_h, _, pad_w, _) + [2, 2], // stride (stride_h, stride_w) + [1, 1], // dilation (dilation_h, dilation_w) + )); +#endif + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 64}, // input size (NHWC) + {256, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 256}, // input size (NHWC) + {64, 1, 1, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 256}, // input size (NHWC) + {512, 1, 1, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 56, 56, 256}, // input size (NHWC) + {128, 1, 1, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, 
pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 128}, // input size (NHWC) + {128, 3, 3, 128}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 128}, // input size (NHWC) + {512, 1, 1, 128}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 512}, // input size (NHWC) + {128, 1, 1, 512}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 512}, // input size (NHWC) + {1024, 1, 1, 512}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 28, 28, 512}, // input size (NHWC) + {256, 1, 1, 512}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 256}, // input size (NHWC) + {256, 3, 3, 256}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 256}, // input size (NHWC) + {1024, 1, 1, 256}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 1024}, // input size (NHWC) + {256, 1, 1, 1024}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 1024}, // input size (NHWC) + {2048, 1, 1, 1024}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 14, 14, 1024}, // input size (NHWC) + {512, 1, 1, 1024}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {2, 2}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 7, 7, 512}, // input size (NHWC) + {512, 3, 3, 512}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 7, 7, 512}, // input size 
(NHWC) + {2048, 1, 1, 512}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_problem_vector.push_back(cutlass::conv::Conv2dProblemSize( + {batch_size, 7, 7, 2048}, // input size (NHWC) + {512, 1, 1, 2048}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + } + +}; + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h new file mode 100644 index 0000000000..14bdd9bf13 --- /dev/null +++ b/test/unit/conv/device/conv2d_testbed.h @@ -0,0 +1,558 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "conv2d_problems.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_compare.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/tensor_view_io.h" + +namespace test { +namespace conv { +namespace device { + +template +class TestbedConv2d { +public: + + using ElementA = typename Conv2d::ElementA; + using LayoutA = typename Conv2d::LayoutA; + using ElementB = typename Conv2d::ElementB; + using LayoutB = typename Conv2d::LayoutB; + using ElementC = typename Conv2d::ElementC; + using LayoutC = typename Conv2d::LayoutC; + using ElementAccumulator = typename Conv2d::ElementAccumulator; + using ElementCompute = typename Conv2d::ElementCompute; + using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator; + + /// Reduction kernel + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + >; + + using ReductionDevice = cutlass::reduction::device::ReduceSplitK; + + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A; + cutlass::HostTensor tensor_B; + cutlass::HostTensor tensor_C; + cutlass::HostTensor tensor_D_computed; + cutlass::HostTensor tensor_D_reference; + +public: + + TestbedConv2d( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 8) { + scope = 2; + } + else if (bits == 16) { + scope = 3; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) { + + 
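+ // Descriptive note (added annotation): the resize calls below size A/B/C and both D tensors from the implicit GEMM extents implied by the convolutional operator (fprop/dgrad/wgrad) and the Conv2d problem size, then the tensors are randomly initialized and copied to the device.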
tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size)); + tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size)); + tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + + initialize_tensor(tensor_A.host_view(), init_A, seed); + initialize_tensor(tensor_B.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C.host_view(), init_C, seed * 39); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D_computed.sync_device(); + tensor_D_reference.sync_device(); + } + + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Conv2d::ImplicitGemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute(0)) { + + // Waive test if CUDA device is insufficient + if (!sufficient()) { + return true; + } + +#if 0 //display conv2d problem size for debugging + std::cout << problem_size << std::endl + << "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl + << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? 
"(serial)" : "(parallel)") << std::endl + << std::endl; +#endif + + initialize(problem_size); + + // configure the operator + Conv2d conv2d_op; + + typename Conv2d::Arguments conv2d_args( + problem_size, + tensor_A.device_ref(), + tensor_B.device_ref(), + tensor_C.device_ref(), + tensor_D_computed.device_ref(), + {alpha, beta}, + split_k_mode + ); + + // find workspace requirement for parallel split-k reduction + size_t workspace_size = Conv2d::get_workspace_size(conv2d_args); + + cutlass::device_memory::allocation workspace(workspace_size); + + cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get()); + + if (status != cutlass::Status::kSuccess) { + cudaError_t error = cudaGetLastError(); + std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n"; + return true; + } + + // conv2d operation with parallel split-k-mode + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // conv2d output is written to workspace in global memory + conv2d_args.ref_D.reset(reinterpret_cast(workspace.get())); + // accumulate mma for each cta in k-dimension (1.0 * A * B) + conv2d_args.output_op = {ElementCompute(1), ElementCompute(0)}; + // update conv2d operator arguments + status = conv2d_op.update(conv2d_args, workspace.get()); + } + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run conv2d operator + status = conv2d_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // configure parallel reduction operator + ReductionDevice reduction_op; + + typename ReductionDevice::Arguments reduction_args( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(), + problem_size.split_k_slices, + cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size), + {reinterpret_cast (workspace.get()), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_D_computed.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_C.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C + ); + + status = reduction_op.initialize(reduction_args, nullptr); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run prallel reduction kernel + status = reduction_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + } + bool passed = false; + + tensor_D_computed.sync_host(); + +#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED + + cutlass::reference::device::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size, + tensor_A.device_ref(), + tensor_B.device_ref(), + tensor_C.device_ref(), + tensor_D_reference.device_ref(), + alpha, + beta); + + cudaError_t result = cudaDeviceSynchronize(); + EXPECT_EQ(result, cudaSuccess) << " device reference error: " + << cudaGetErrorString(result); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D_reference.sync_host(); + +#else + + cutlass::reference::host::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + 
ElementC, + LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size, + tensor_A.host_ref(), + tensor_B.host_ref(), + tensor_C.host_ref(), + tensor_D_reference.host_ref(), + alpha, + beta); + +#endif + passed = cutlass::reference::host::TensorEquals( + tensor_D_computed.host_view(), + tensor_D_reference.host_view()); + + EXPECT_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_Conv2d_ImplicitGemm_device_" + << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_") + << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" : + (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_")) + << "nhwc_" + << problem_size.N << "x" + << problem_size.H << "x" + << problem_size.W << "x" + << problem_size.C + << "_krsc_" + << problem_size.K << "x" + << problem_size.R << "x" + << problem_size.S << "x" + << problem_size.C + << "_padding_" + << problem_size.pad_h << "x" + << problem_size.pad_w + << "_stride_" + << problem_size.stride_h << "x" + << problem_size.stride_w + << "_dilation_" + << problem_size.dilation_h << "x" + << problem_size.dilation_w << "_" + << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_") + << Conv2d::ThreadblockShape::kM << "x" + << Conv2d::ThreadblockShape::kN << "x" + << Conv2d::ThreadblockShape::kK << "_" + << Conv2d::WarpShape::kM << "x" + << Conv2d::WarpShape::kN << "x" + << Conv2d::WarpShape::kK << ".txt"; + + std::cout << fname.str() << std::endl; + + std::ofstream results(fname.str()); + + results << problem_size << std::endl; + + results + << "\nA:\n" << tensor_A.host_view() << "\n" + << "\nB:\n" << tensor_B.host_view() << "\n" + << "\nC:\n" << tensor_C.host_view() << "\n" + << "\nD reference:\n" << tensor_D_reference.host_view() << "\n" + << "\nD computed:\n" << tensor_D_computed.host_view() << "\n"; + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference +// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes +// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +// (conv_blacklist_sizes) +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +bool TestAllConv2d( + const Conv2dProblemVector & conv_test_sizes = Conv2dProblemVector(), + const Conv2dProblemVector & conv_blacklist_sizes = Conv2dProblemVector()) { + + bool passed = true; + + // + // Testbed object + // + + TestbedConv2d testbed; + + // + // Get conv problem sizes to run conv operator + // + TestbedConv2dProblemSizes conv_problems(128/cutlass::sizeof_bits::value); + + // Vector of conv2d problem sizes to avoid duplicate runs + Conv2dProblemVector conv_tested_sizes; + + Conv2dProblemVector const *problem_vectors[] = { + &conv_test_sizes, // run user specified sizes + &conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes + &conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes +#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED + &conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled +#endif + }; + + // Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0) + for 
(Conv2dProblemVector const * problem_vector : problem_vectors) { + + // Run conv testbed on default convolution sizes + for(auto conv_problem : *problem_vector) { + + // Skip blacklist and avoid duplicate problem sizes + if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() || + std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) { + continue; + } + + // + // Procedurally disable certain cases + // + + // CUTLASS DGRAD's unity stride specialization only support stride {1, 1} + if ((ImplicitGemm::kConvolutionalOperator == + cutlass::conv::Operator::kDgrad) && + (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == + cutlass::conv::StrideSupport::kUnity)) { + if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) { + continue; + } + } + + // + // Test + // + // push back tested problem size to avoid re-running duplicates + conv_tested_sizes.push_back(conv_problem); + + // test mode = xcross + passed = testbed.run( + conv_problem, + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + + // test mode = convolution + passed = testbed.run( + conv_problem.reset_mode(cutlass::conv::Mode::kConvolution), + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + } + } + + // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for + // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters + // which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep + // alpha and beta for local testing, but only runs one value for alpha and beta. + cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( + {1, 17, 11, 288}, // input size (NHWC) + {160, 3, 3, 288}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + ); + + cutlass::conv::SplitKMode split_k_modes [] = { + cutlass::conv::SplitKMode::kSerial, + cutlass::conv::SplitKMode::kParallel, + }; + + int split_k_slices[] = { + 1, 2, 3, 4, 201 + }; + + double problem_alpha[] = { + 2.0 + }; + + double problem_beta[] = { + 2.0 + }; + + for (auto split_k_mode : split_k_modes) { + for (auto split_k_slice : split_k_slices) { + for (auto alpha : problem_alpha) { + for (auto beta : problem_beta) { + + passed = testbed.run( + conv2d_split_k_test_size.reset_split_k_slices(split_k_slice), + split_k_mode, + cutlass::from_real(alpha), + cutlass::from_real(beta)); + + if (!passed) { + return false; + } + } + } + } + } + + return passed; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h new file mode 100644 index 0000000000..cb4ecc7056 --- /dev/null +++ b/test/unit/conv/device/conv2d_testbed_interleaved.h @@ -0,0 +1,534 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "conv2d_problems.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/host_reorder.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/tensor_view_io.h" + +namespace test { +namespace conv { +namespace device { + +template +class InterleavedTestbedConv2d { +public: + + using ElementA = typename Conv2d::ElementA; + using LayoutA = typename Conv2d::LayoutA; + using ElementB = typename Conv2d::ElementB; + using LayoutB = typename Conv2d::LayoutB; + using ElementC = typename Conv2d::ElementC; + using LayoutC = typename Conv2d::LayoutC; + using ElementAccumulator = typename Conv2d::ElementAccumulator; + using ElementCompute = typename Conv2d::ElementCompute; + using EpilogueOutputOp = typename Conv2d::EpilogueOutputOp; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv2d::kConvolutionalOperator; + + /// Reduction kernel + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + >; + + using ReductionDevice = cutlass::reduction::device::ReduceSplitK; + + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + 
cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A; + cutlass::HostTensor tensor_B; + cutlass::HostTensor tensor_B_reordered; + cutlass::HostTensor tensor_C; + cutlass::HostTensor tensor_D_computed; + cutlass::HostTensor tensor_D_reference; + +public: + + InterleavedTestbedConv2d( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 8) { + scope = 2; + } + else if (bits == 16) { + scope = 3; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size, uint64_t seed = 2019) { + + tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size)); + tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size)); + tensor_B_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size)); + tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + + initialize_tensor(tensor_A.host_view(), init_A, seed); + initialize_tensor(tensor_B.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C.host_view(), init_C, seed * 39); + + cutlass::reorder_convK( + tensor_B_reordered.host_ref(), tensor_B.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size)); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_B_reordered.sync_device(); + tensor_C.sync_device(); + tensor_D_computed.sync_device(); + tensor_D_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute(0)) { + +#if 0 //display conv2d problem size for debugging + std::cout << problem_size << std::endl + << "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl + << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? 
"(serial)" : "(parallel)") << std::endl + << std::endl; +#endif + + initialize(problem_size); + + // configure the operator + Conv2d conv2d_op; + + typename Conv2d::Arguments conv2d_args( + problem_size, + tensor_A.device_ref(), + tensor_B_reordered.device_ref(), + tensor_C.device_ref(), + tensor_D_computed.device_ref(), + {alpha, beta}, + split_k_mode + ); + + // find workspace requirement for parallel split-k reduction + size_t workspace_size = Conv2d::get_workspace_size(conv2d_args); + + cutlass::device_memory::allocation workspace(workspace_size); + + cutlass::Status status = conv2d_op.initialize(conv2d_args, workspace.get()); + + // conv2d operation with parallel split-k-mode + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // conv2d output is written to workspace in global memory + conv2d_args.ref_D.reset(reinterpret_cast(workspace.get())); + // accumulate mma for each cta in k-dimension (1.0 * A * B) + conv2d_args.output_op = {ElementCompute(1), ElementCompute(0)}; + // update conv2d operator arguments + status = conv2d_op.update(conv2d_args, workspace.get()); + } + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run conv2d operator + status = conv2d_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // configure parallel reduction operator + ReductionDevice reduction_op; + + typename ReductionDevice::Arguments reduction_args( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(), + problem_size.split_k_slices, + cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size), + {reinterpret_cast (workspace.get()), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_D_computed.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_C.device_data(), tensor_C.stride(Conv2d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C + ); + + status = reduction_op.initialize(reduction_args, nullptr); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run prallel reduction kernel + status = reduction_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + } + bool passed = false; + + tensor_D_computed.sync_host(); + +#if CUTLASS_CONV_TEST_UNIT_REFERENCE_DEVICE_ENABLED + + cutlass::reference::device::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size, + tensor_A.device_ref(), + tensor_B.device_ref(), + tensor_C.device_ref(), + tensor_D_reference.device_ref(), + alpha, + beta); + + cudaError_t result = cudaDeviceSynchronize(); + EXPECT_EQ(result, cudaSuccess) << " device reference error: " + << cudaGetErrorString(result); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D_reference.sync_host(); + +#else + + cutlass::reference::host::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size, + 
tensor_A.host_ref(), + tensor_B.host_ref(), + tensor_C.host_ref(), + tensor_D_reference.host_ref(), + alpha, + beta); + +#endif + passed = cutlass::reference::host::TensorEquals( + tensor_D_computed.host_view(), + tensor_D_reference.host_view()); + + EXPECT_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_Conv2d_ImplicitGemm_device_" + << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_") + << (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? "fprop_" : + (Conv2d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_")) + << "nhwc_" + << problem_size.N << "x" + << problem_size.H << "x" + << problem_size.W << "x" + << problem_size.C + << "_krsc_" + << problem_size.K << "x" + << problem_size.R << "x" + << problem_size.S << "x" + << problem_size.C + << "_padding_" + << problem_size.pad_h << "x" + << problem_size.pad_w + << "_stride_" + << problem_size.stride_h << "x" + << problem_size.stride_w + << "_dilation_" + << problem_size.dilation_h << "x" + << problem_size.dilation_w << "_" + << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_") + << Conv2d::ThreadblockShape::kM << "x" + << Conv2d::ThreadblockShape::kN << "x" + << Conv2d::ThreadblockShape::kK << "_" + << Conv2d::WarpShape::kM << "x" + << Conv2d::WarpShape::kN << "x" + << Conv2d::WarpShape::kK << ".txt"; + + std::cout << fname.str() << std::endl; + + std::ofstream results(fname.str()); + + results << problem_size << std::endl; + + results + << "\nA:\n" << tensor_A.host_view() << "\n" + << "\nB:\n" << tensor_B.host_view() << "\n" + << "\nB_reordered =\n" << tensor_B_reordered.host_view() << "\n" + << "\nC:\n" << tensor_C.host_view() << "\n" + << "\nD reference:\n" << tensor_D_reference.host_view() << "\n" + << "\nD computed:\n" << tensor_D_computed.host_view() << "\n"; + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference +// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes +// Additionaly, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +// (conv_blacklist_sizes) +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +template +bool TestAllInterleavedConv2d( + const Conv2dProblemVector & conv_test_sizes = Conv2dProblemVector(), + const Conv2dProblemVector & conv_blacklist_sizes = Conv2dProblemVector()) { + + bool passed = true; + + // + // Testbed object + // + + InterleavedTestbedConv2d testbed; + + // + // Get conv problem sizes to run conv operator + // + TestbedConv2dProblemSizes conv_problems(InterleavedK); // minimum channel size must be multiple of InterleavedK for interleaved layout + + // Vector of conv2d problem sizes to avoid duplicate runs + Conv2dProblemVector conv_tested_sizes; + + Conv2dProblemVector const *problem_vectors[] = { + &conv_test_sizes, // run user specified sizes + &conv_problems.conv2d_default_sizes, // run default and cudnn bug sizes + &conv_problems.conv2d_resnet50_sizes, // run resnet50 sizes +#if CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED + &conv_problems.conv2d_rigorous_sizes, // run large and rigorous sizes if enabled +#endif + }; + + // Sweep conv2d problem sizes (split-k-mode=kSerial, 
split-k-slice=1, alpha=1.0, beta=0.0) + for (Conv2dProblemVector const * problem_vector : problem_vectors) { + + ChannelDivisibilitySpecification channel_spec(InterleavedK); //input and output channels must be multiple of InterleavedK + auto pruned_problem_vector = prune(*problem_vector, channel_spec); + + // Run conv testbed on default convolution sizes + for(auto conv_problem : pruned_problem_vector) { + + // Skip blacklist and avoid duplicate problem sizes + if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() || + std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) { + continue; + } + + // + // Procedurally disable certain cases + // + + // CUTLASS DGRAD's unity stride specialization only support stride {1, 1} + if ((ImplicitGemm::kConvolutionalOperator == + cutlass::conv::Operator::kDgrad) && + (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == + cutlass::conv::StrideSupport::kUnity)) { + if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) { + continue; + } + } + + // + // Test + // + // push back tested problem size to avoid re-running duplicates + conv_tested_sizes.push_back(conv_problem); + + // test mode = xcross + passed = testbed.run( + conv_problem, + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + + // test mode = convolution + passed = testbed.run( + conv_problem.reset_mode(cutlass::conv::Mode::kConvolution), + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + } + } + +#if 0 + // Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for + // a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters + // which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep + // alpha and beta for local testing, but only runs one value for alpha and beta. 
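+ // Descriptive note (added annotation): this split-k / alpha / beta sweep mirrors the one in TestAllConv2d but is compiled out here (#if 0) for the interleaved testbed.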
+ cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size ( + {1, 17, 11, 288}, // input size (NHWC) + {160, 3, 3, 288}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + ); + + cutlass::conv::SplitKMode split_k_modes [] = { + cutlass::conv::SplitKMode::kSerial, + cutlass::conv::SplitKMode::kParallel, + }; + + int split_k_slices[] = { + 1, 2, 3, 4, 201 + }; + + double problem_alpha[] = { + 2.0 + }; + + double problem_beta[] = { + 2.0 + }; + + for (auto split_k_mode : split_k_modes) { + for (auto split_k_slice : split_k_slices) { + for (auto alpha : problem_alpha) { + for (auto beta : problem_beta) { + + passed = testbed.run( + conv2d_split_k_test_size.reset_split_k_slices(split_k_slice), + split_k_mode, + cutlass::from_real(alpha), + cutlass::from_real(beta)); + + if (!passed) { + return false; + } + } + } + } + } +#endif + + return passed; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu new file mode 100644 index 0000000000..07961dd2b7 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -0,0 +1,172 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x2_32x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x2_32x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM50_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x2_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// 
Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm50, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..a68a30fe5b --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -0,0 +1,311 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using 
ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = 
cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::complex; + using ElementB = cutlass::complex; + using ElementC = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementCompute = cutlass::complex; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAddComplex, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu new file mode 100644 index 0000000000..3cbde02888 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -0,0 +1,122 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + + +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, + 128x128_64x3_64x64x64) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = 
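The epilogue's vector width in these kernels is written as the number of elements per 128-bit memory access; the template argument stripped from `cutlass::sizeof_bits::value` above is presumably `ElementC`. For the half-precision output used in this file the expression evaluates to 8, and for the float and tf32 output files later in the diff it evaluates to 4:

#include "cutlass/numeric_types.h"

static_assert(128 / cutlass::sizeof_bits<cutlass::half_t>::value == 8,
              "8 half_t elements per 128-bit access");
static_assert(128 / cutlass::sizeof_bits<float>::value == 4,
              "4 float elements per 128-bit access");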
cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu new file mode 100644 index 0000000000..ffb79d77ad --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -0,0 +1,78 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) + +TEST(SM70_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm70, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM70_SUPPORTED + diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu new file mode 100644 index 0000000000..1101090a12 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -0,0 +1,78 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED + diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..ade6f8df32 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,161 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
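The SM70 and SM75 tests above differ from the SM80 files mainly in the tensor core instruction shape they target. A compile-time summary of the shapes used across these wgrad tests is sketched below; the static_assert is only there to make the K-depth difference explicit.

#include "cutlass/gemm/gemm.h"

using Sm70Mma     = cutlass::gemm::GemmShape<8, 8, 4>;    // Volta HMMA, f16 inputs, f32 accumulate
using Sm75Mma     = cutlass::gemm::GemmShape<16, 8, 8>;   // Turing HMMA, f16 inputs
using Sm80MmaF16  = cutlass::gemm::GemmShape<16, 8, 16>;  // Ampere, f16 inputs
using Sm80MmaTf32 = cutlass::gemm::GemmShape<16, 8, 8>;   // Ampere, tf32 inputs

static_assert(Sm80MmaF16::kK == 4 * Sm70Mma::kK,
              "an Ampere f16 MMA consumes a 4x deeper K-slice than a Volta HMMA");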
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + 
cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, + 64x256_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32 >, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kStrided + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu new file mode 100644 index 0000000000..a0aac81147 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -0,0 +1,321 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
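The 64x256 variant above adds an explicit `cutlass::conv::StrideSupport::kStrided` argument after the iterator algorithm. `kStrided` marks a kernel specialization that accepts arbitrary convolution strides, whereas its counterpart `kUnity` is limited to unit stride. A stride-2 problem of the kind such a kernel is expected to cover, with illustrative sizes not taken from the testbed:

cutlass::conv::Conv2dProblemSize strided_problem(
    {1, 56, 56, 64},   // input size  (NHWC)
    {128, 3, 3, 64},   // filter size (KRSC)
    {1, 1, 1, 1},      // padding     (pad_h, _, pad_w, _)
    {2, 2},            // stride      (stride_h, stride_w) != 1
    {1, 1}             // dilation    (dilation_h, dilation_w)
);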
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 64x64_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_32x64x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + test::conv::device::Conv2dProblemVector user_size; + + user_size.push_back(cutlass::conv::Conv2dProblemSize( + {1, 8, 8, 4}, // input size (NHWC) + {8, 1, 1, 4}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d(user_size)); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 32x64_8x4_32x64x8) { + + /// Conv operation 
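The user-defined problem pushed into `user_size` above has a 1x1 filter, zero padding, and unit stride, so its output extent follows directly from the standard convolution formula, and the resulting implicit-GEMM extents for wgrad are deliberately tiny. A worked version follows; the P and Q members are assumed to be computed by the `Conv2dProblemSize` constructor.

//   P = (H + pad_h0 + pad_h1 - ((R - 1) * dilation_h + 1)) / stride_h + 1
//     = (8 + 0 + 0 - 1) / 1 + 1 = 8,   and Q = 8 by the same formula,
//   so the output tensor is {1, 8, 8, 8} (NPQK).
//   For wgrad the implicit GEMM problem is therefore
//     GEMM_M = K         = 8
//     GEMM_N = R * S * C = 1 * 1 * 4 = 4
//     GEMM_K = N * P * Q = 1 * 8 * 8 = 64
cutlass::conv::Conv2dProblemSize user_problem(
    {1, 8, 8, 4},   // input size  (NHWC)
    {8, 1, 1, 4},   // filter size (KRSC)
    {0, 0, 0, 0},   // padding
    {1, 1},         // stride
    {1, 1}          // dilation
);
// user_problem.P == 8 and user_problem.Q == 8 here.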
element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32, + 128x128_8x4_64x32x8) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = float; + using ElementB = float; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, + cutlass::layout::TensorNHWC, + ElementB, + cutlass::layout::TensorNHWC, + ElementC, + cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 1, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); + +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..2185257f15 --- /dev/null +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,81 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv2d_testbed.h" + + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv2dWgradKernel = typename cutlass::conv::kernel::DefaultConv2dWgrad< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv2dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv2d instance + EXPECT_TRUE(test::conv::device::TestAllConv2d()); +} + + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..211a331d8b --- /dev/null +++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,80 @@ 
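The TF32 test above feeds `cutlass::tfloat32_t` operands into a 16x8x8 tensor core instruction. `tfloat32_t` keeps float's 8-bit exponent but only 10 explicit mantissa bits, and it occupies a full 32-bit word, so its memory vectorization matches f32. A small illustration, assuming the constructor-from-float and float conversion provided by `cutlass/numeric_types.h`:

#include "cutlass/numeric_types.h"

static_assert(cutlass::sizeof_bits<cutlass::tfloat32_t>::value == 32,
              "tf32 is stored in a 32-bit word");

float exact = 1.0f + 1.0f / 1024.0f;   // 1 + 2^-10 fits in a 10-bit mantissa
cutlass::tfloat32_t t(exact);          // float(t) == exact
// Values requiring more mantissa bits (e.g. 1 + 2^-17) are rounded to the
// nearest representable tf32 value before the tensor core MMA consumes them.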
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Dgrad_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // 
CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..0aabef5ba6 --- /dev/null +++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,80 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Fprop_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_problems.h b/test/unit/conv/device/conv3d_problems.h new file mode 100644 index 0000000000..9cc618467e --- /dev/null +++ b/test/unit/conv/device/conv3d_problems.h @@ -0,0 +1,248 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed sizes for Conv2d problem +*/ +#pragma once + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_types.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/core_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + +namespace test { +namespace conv { +namespace device { + +using Conv3dProblemVector = std::vector; + +//////////////////////////////////////////////////////////////////////////// +/// Structure TestbedConv3dProblemSizes initializes and holds conv default and +/// important network sizes +//////////////////////////////////////////////////////////////////////////// +struct TestbedConv3dProblemSizes { + + // + // Data members + // + int minimum_channel_size; + Conv3dProblemVector conv3d_default_sizes; + Conv3dProblemVector conv3d_vnet_medical_sizes; + + // + // Methods + // + /// Default ctor + TestbedConv3dProblemSizes(int minimum_channel_size_ = 64): minimum_channel_size (minimum_channel_size_) { + + initialize_conv3d_default_sizes(); + initialize_conv3d_vnet_medical_sizes(conv3d_vnet_medical_sizes, 1 /*batch-size*/); + + filter_all(); + } + + /// Eliminates some illegal cases + void filter_all() { + + Conv3dProblemVector *problems_vectors[] = { + &conv3d_default_sizes, + &conv3d_vnet_medical_sizes + }; + + for (Conv3dProblemVector *problems : problems_vectors) { + Conv3dProblemVector filtered; + + for (cutlass::conv::Conv3dProblemSize const & problem : *problems) { + if (!(problem.C % minimum_channel_size)) { + filtered.push_back(problem); + } + } + + *problems = filtered; + } + } + + // Add a few standard convolution problem sizes + void initialize_conv3d_default_sizes() { + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 1, 3, 3, minimum_channel_size}, // input size (NDHWC) + {8, 1, 1, 1, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 1, 16, 16, minimum_channel_size}, // input size (NDHWC) + {8, 1, 3, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({0, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 1, 15, 19, 160}, // input size 
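As in the 2-D test headers, the element type of the problem vector above lost its template argument in this rendering; restored, the alias presumably reads as below. `filter_all()` then keeps only those problems whose channel count is a multiple of `minimum_channel_size` (64 by default), which the illustrative predicate makes explicit.

using Conv3dProblemVector = std::vector<cutlass::conv::Conv3dProblemSize>;

// Hypothetical helper equivalent to the condition inside filter_all():
bool keep_problem(cutlass::conv::Conv3dProblemSize const &problem,
                  int minimum_channel_size) {
  return (problem.C % minimum_channel_size) == 0;
}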
(NDHWC) + {224, 1, 3, 6, 160}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 2, 1, 1, minimum_channel_size}, // input size (NDHWC) + {8, 2, 1, 1, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 1, 7, 7, minimum_channel_size}, // input size (NDHWC) + {16, 1, 3, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 11, 15, 19, 64}, // input size (NDHWC) + {32, 4, 3, 6, 64}, // filter size (KTRSC) + cutlass::Coord<3>({2, 1, 3}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + } + + // Add vnet layers to unit testing sizes + void initialize_conv3d_vnet_medical_sizes(Conv3dProblemVector &conv3d_problem_vector, int batch_size = 1) { + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 32, 32, 32, 16}, // input size (NDHWC) + {32, 2, 2, 2, 16}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 16, 16, 16, 32}, // input size (NDHWC) + {32, 3, 3, 3, 32}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 16, 16, 16, 32}, // input size (NDHWC) + {64, 2, 2, 2, 32}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 8, 8, 8, 64}, // input size (NDHWC) + {64, 3, 3, 3, 64}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 8, 8, 8, 64}, // input size (NDHWC) + {128, 2, 2, 2, 64}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 
1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 4, 4, 4, 128}, // input size (NDHWC) + {128, 3, 3, 3, 128}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 8, 8, 8, 128}, // input size (NDHWC) + {128, 3, 3, 3, 128}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 16, 16, 16, 64}, // input size (NDHWC) + {64, 3, 3, 3, 64}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 32, 32, 32, 16}, // input size (NDHWC) + {64, 2, 2, 2, 16}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + + conv3d_problem_vector.push_back(cutlass::conv::Conv3dProblemSize( + {batch_size, 16, 16, 16, 32}, // input size (NDHWC) + {128, 2, 2, 2, 32}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({2, 2, 2}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + } + +}; + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h new file mode 100644 index 0000000000..179520d158 --- /dev/null +++ b/test/unit/conv/device/conv3d_testbed.h @@ -0,0 +1,537 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
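The V-Net sizes above are deliberately chained: every strided 2x2x2 layer halves each spatial extent, while every 3x3x3 layer with padding 1 and stride 1 preserves it, so the output of one entry matches the input of the next. A minimal standalone sketch of that arithmetic, assuming the standard cross-correlation output-extent relation (the helper below is illustrative and not a CUTLASS API):

#include <cassert>

// Output extent along one spatial dimension for cross-correlation.
inline int conv_output_extent(int input, int filter, int pad, int stride, int dilation) {
  return (input + 2 * pad - ((filter - 1) * dilation + 1)) / stride + 1;
}

int main() {
  // {N, 32, 32, 32, 16} activation with a {32, 2, 2, 2, 16} filter, pad 0, stride 2 -> 16^3 output
  assert(conv_output_extent(32, 2, /*pad=*/0, /*stride=*/2, /*dilation=*/1) == 16);
  // {N, 16, 16, 16, 32} activation with a {32, 3, 3, 3, 32} filter, pad 1, stride 1 -> 16^3 output
  assert(conv_output_extent(16, 3, /*pad=*/1, /*stride=*/1, /*dilation=*/1) == 16);
  return 0;
}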
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "cutlass/util/reference/host/tensor_fill.h" + +#include "cutlass/util/reference/host/convolution.h" + +#include "cutlass/util/reference/host/tensor_compare.h" + +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/reference/device/tensor_compare.h" + +#include "conv3d_problems.h" +#include "cutlass/core_io.h" + +namespace test { +namespace conv { +namespace device { + +template +class TestbedConv3d { +public: + + using ElementA = typename Conv3d::ElementA; + using LayoutA = typename Conv3d::LayoutA; + using ElementB = typename Conv3d::ElementB; + using LayoutB = typename Conv3d::LayoutB; + using ElementC = typename Conv3d::ElementC; + using LayoutC = typename Conv3d::LayoutC; + using ElementAccumulator = typename Conv3d::ElementAccumulator; + using ElementCompute = typename Conv3d::ElementCompute; + using EpilogueOutputOp = typename Conv3d::EpilogueOutputOp; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv3d::kConvolutionalOperator; + + /// Reduction kernel + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + >; + + using ReductionDevice = cutlass::reduction::device::ReduceSplitK; + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A; + cutlass::HostTensor tensor_B; + cutlass::HostTensor tensor_C; + cutlass::HostTensor tensor_D_computed; + cutlass::HostTensor tensor_D_reference; + +public: + + TestbedConv3d( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 8) { + scope = 2; + } + else if (bits == 16) { + scope = 4; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if 
(dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv3dProblemSize const &problem_size, uint64_t seed = 2019) { + + tensor_A.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size)); + tensor_B.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size)); + tensor_C.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + tensor_D_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size)); + + initialize_tensor(tensor_A.host_view(), init_A, seed); + initialize_tensor(tensor_B.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C.host_view(), init_C, seed * 39); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D_computed.sync_device(); + tensor_D_reference.sync_device(); + } + + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Conv3d::ImplicitGemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + + + /// Executes one test + bool run( + cutlass::conv::Conv3dProblemSize const &problem_size, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute()) { + + // Waive test if CUDA device is insufficient. + if (!sufficient()) { + return true; + } + +#if 0 //display conv2d problem size for debugging + std::cout << problem_size << std::endl + << "alpha, beta: (" << float(alpha) << ", " << float(beta) << ")" << std::endl + << "split_k_mode: " << ((split_k_mode == cutlass::conv::SplitKMode::kSerial) ? 
"(serial)" : "(parallel)") << std::endl + << std::endl; +#endif + + initialize(problem_size); + + // configure the operator + Conv3d conv3d_op; + + typename Conv3d::Arguments conv3d_args( + problem_size, + tensor_A.device_ref(), + tensor_B.device_ref(), + tensor_C.device_ref(), + tensor_D_computed.device_ref(), + {alpha, beta}, + split_k_mode + ); + + // find workspace requirement for parallel split-k reduction + size_t workspace_size = Conv3d::get_workspace_size(conv3d_args); + + cutlass::device_memory::allocation workspace(workspace_size); + + cutlass::Status status = conv3d_op.initialize(conv3d_args, workspace.get()); + + if (status != cutlass::Status::kSuccess) { + cudaError_t error = cudaGetLastError(); + std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n"; + return true; + } + + // conv3d operation with parallel split-k-mode + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // conv3d output is written to workspace in global memory + conv3d_args.ref_D.reset(reinterpret_cast(workspace.get())); + // accumulate mma for each cta in k-dimension (1.0 * A * B) + conv3d_args.output_op = {1.0, 0.0}; + // update conv3d operator arguments + status = conv3d_op.update(conv3d_args, workspace.get()); + } + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run conv3d operator + status = conv3d_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + if (split_k_mode == cutlass::conv::SplitKMode::kParallel) { + + // configure parallel reduction operator + ReductionDevice reduction_op; + + typename ReductionDevice::Arguments reduction_args( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, problem_size).mn(), + problem_size.split_k_slices, + cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, problem_size), + {reinterpret_cast (workspace.get()), tensor_C.stride(Conv3d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_D_computed.device_data(), tensor_C.stride(Conv3d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {tensor_C.device_data(), tensor_C.stride(Conv3d::ImplicitGemmKernel::kTensorCStrideIdx)}, + {alpha, beta} // apply alpha, beta to obtain the following equation alpha * ReduceAdd(A * B) + beta * C + ); + + status = reduction_op.initialize(reduction_args, nullptr); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + + // run prallel reduction kernel + status = reduction_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + return false; + } + } + bool passed = false; + + cutlass::reference::host::Conv3d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + ElementCompute + >( + kConvolutionalOperator, + problem_size, + tensor_A.host_ref(), + tensor_B.host_ref(), + tensor_C.host_ref(), + tensor_D_reference.host_ref(), + alpha, + beta + ); + + tensor_D_computed.sync_host(); + + passed = cutlass::reference::host::TensorEquals( + tensor_D_computed.host_view(), + tensor_D_reference.host_view()); + + EXPECT_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_Conv3d_ImplicitGemm_device_" + << (split_k_mode == cutlass::conv::SplitKMode::kSerial ? "serial_reduction_" : "parallel_reduction_") + << (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kFprop ? 
"fprop_" : + (Conv3d::kConvolutionalOperator == cutlass::conv::Operator::kDgrad ? "dgrad_" : "wgrad_")) + << "ndhwc_" + << problem_size.N << "x" + << problem_size.D << "x" + << problem_size.H << "x" + << problem_size.W << "x" + << problem_size.C + << "_ktrsc_" + << problem_size.K << "x" + << problem_size.T << "x" + << problem_size.R << "x" + << problem_size.S << "x" + << problem_size.C + << "_padding_" + << problem_size.pad_d << "x" + << problem_size.pad_h << "x" + << problem_size.pad_w + << "_stride_" + << problem_size.stride_d << "x" + << problem_size.stride_h << "x" + << problem_size.stride_w + << "_dilation_" + << problem_size.dilation_d << "x" + << problem_size.dilation_h << "x" + << problem_size.dilation_w << "_" + << (problem_size.mode == cutlass::conv::Mode::kCrossCorrelation ? "xcorr_" : "conv_") + << Conv3d::ThreadblockShape::kM << "x" + << Conv3d::ThreadblockShape::kN << "x" + << Conv3d::ThreadblockShape::kK << "_" + << Conv3d::WarpShape::kM << "x" + << Conv3d::WarpShape::kN << "x" + << Conv3d::WarpShape::kK << ".txt"; + + std::cout << fname.str() << std::endl; + + std::ofstream results(fname.str()); + + results << problem_size << std::endl; + + results + << "\nA:\n" << tensor_A.host_view() << "\n" + << "\nB:\n" << tensor_B.host_view() << "\n" + << "\nC:\n" << tensor_C.host_view() << "\n" + << "\nD reference:\n" << tensor_D_reference.host_view() << "\n" + << "\nD computed:\n" << tensor_D_computed.host_view() << "\n"; + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference +// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes +// Additionaly, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes +// (conv_blacklist_sizes) +///////////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +bool TestAllConv3d( + const Conv3dProblemVector & conv_test_sizes = Conv3dProblemVector(), + const Conv3dProblemVector & conv_blacklist_sizes = Conv3dProblemVector()) { + + bool passed = true; + + // + // Testbed object + // + + //TestbedConv3d testbed(cutlass::Distribution::Sequential, cutlass::Distribution::Sequential, cutlass::Distribution::Sequential); + TestbedConv3d testbed; + + // + // Get conv problem sizes to run conv operator + // + TestbedConv3dProblemSizes conv3d_problems(128/cutlass::sizeof_bits::value); + + // + // Get conv problem sizes to run conv operator + // + //TestbedConv3dProblemSizes conv_problems(128/cutlass::sizeof_bits::value); + + // Vector of conv3d problem sizes to avoid duplicate runs + Conv3dProblemVector conv_tested_sizes; + + Conv3dProblemVector const *problem_vectors[] = { + &conv3d_problems.conv3d_default_sizes, + &conv3d_problems.conv3d_vnet_medical_sizes, + &conv_test_sizes + }; + + // Sweep conv3d problem sizes (split-k-mode=kSerial, split-k-slice=1, alpha=1.0, beta=0.0) + for (Conv3dProblemVector const * problem_vector : problem_vectors) { + + // Run conv testbed on default convolution sizes + for(auto conv_problem : *problem_vector) { + + // Skip blacklist and avoid duplicate problem sizes + if (std::find(conv_blacklist_sizes.begin(), conv_blacklist_sizes.end(), conv_problem) != conv_blacklist_sizes.end() || + std::find(conv_tested_sizes.begin(), conv_tested_sizes.end(), conv_problem) != conv_tested_sizes.end()) 
{ + continue; + } + + // + // Procedurally disable certain cases + // + + // CUTLASS DGRAD's unity stride specialization only supports stride {1, 1} + if ((ImplicitGemm::kConvolutionalOperator == + cutlass::conv::Operator::kDgrad) && + (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == + cutlass::conv::StrideSupport::kUnity)) { + if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) { + continue; + } + } + + // + // Test + // + // push back tested problem size to avoid re-running duplicates + conv_tested_sizes.push_back(conv_problem); + + // test mode = xcross + passed = testbed.run( + conv_problem, + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + + // test mode = convolution + passed = testbed.run( + conv_problem.reset_mode(cutlass::conv::Mode::kConvolution), + cutlass::conv::SplitKMode::kSerial); + + if (!passed) { + return false; + } + } + } + + // Sweep split-k-slices using serial and parallel reduction with non-unity alpha and non-zero beta for + // a single conv3d problem size. Convolution unit tests take a long time to run so only sweep parameters + // which are absolutely necessary to catch functional bugs. The code below provides the option to sweep + // alpha and beta for local testing, but only runs one value for alpha and beta. + cutlass::conv::Conv3dProblemSize conv3d_split_k_test_size ( + {1, 8, 8, 8, 32}, // input size (NDHWC) + {32, 3, 3, 3, 32}, // filter size (KTRSC) + cutlass::Coord<3>({0, 0, 0}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + ); + + cutlass::conv::SplitKMode split_k_modes [] = { + cutlass::conv::SplitKMode::kSerial, + cutlass::conv::SplitKMode::kParallel + }; + + int split_k_slices[] = { + 1, 2, 3, 4, 201 + }; + + double problem_alpha[] = { + 2.0 + }; + + double problem_beta[] = { + 2.0 + }; + + for (auto split_k_mode : split_k_modes) { + for (auto split_k_slice : split_k_slices) { + for (auto alpha : problem_alpha) { + for (auto beta : problem_beta) { + + passed = testbed.run( + conv3d_split_k_test_size.reset_split_k_slices(split_k_slice), + split_k_mode, + cutlass::from_real<typename ImplicitGemm::ElementCompute>(alpha), + cutlass::from_real<typename ImplicitGemm::ElementCompute>(beta)); + + if (!passed) { + return false; + } + } + } + } + } + + return passed; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace conv +} // namespace test diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu new file mode 100644 index 0000000000..a3f8409447 --- /dev/null +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu @@ -0,0 +1,78 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer.
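TestAllConv3d also accepts caller-supplied problem sizes and a blacklist, which none of the wgrad unit tests that follow exercise. A hedged sketch of that usage, intended for a test body that already includes conv3d_testbed.h; Conv3dFprop stands in for any cutlass::conv::device::ImplicitGemmConvolution instantiation (for example one built from DefaultConv3dFprop), and the chosen extents are illustrative only:

// Extra problem size to cover beyond the built-in default and V-Net lists.
test::conv::device::Conv3dProblemVector extra_sizes;
extra_sizes.push_back(cutlass::conv::Conv3dProblemSize(
  {1, 4, 56, 56, 64},             // input size (NDHWC)
  {64, 3, 3, 3, 64},              // filter size (KTRSC)
  cutlass::Coord<3>({1, 1, 1}),   // padding
  cutlass::Coord<3>({1, 1, 1}),   // stride
  cutlass::Coord<3>({1, 1, 1})    // dilation
));

// Sizes known to be unsupported by this particular kernel could be listed here and skipped.
test::conv::device::Conv3dProblemVector blacklist;

// Conv3dFprop is a placeholder device-level operator alias, not defined in this patch.
EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>(extra_sizes, blacklist));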
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +TEST(SM75_Device_Conv3d_Wgrad_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x2_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED + diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..9847aede81 --- /dev/null +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,159 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +TEST(SM80_Device_Conv3d_Wgrad_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// + + +TEST(SM80_Device_Conv3d_Wgrad_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = 
float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv3dWgradKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dWgrad>()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Wgrad_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 64x256_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv3dWgradKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dWgrad>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..6dcbf0e726 --- /dev/null +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,120 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution.
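The tile shapes in the instantiations above fix the threadblock layout: a 128x128x32 threadblock tile over 64x64x32 warp tiles gives (128/64) x (128/64) = 4 warps, that is 128 threads per CTA, and the 64x256x32 variant likewise uses 1 x 4 = 4 warps. A small compile-time sketch of that arithmetic (WarpCount here is an illustrative helper, not the CUTLASS-internal definition):

#include "cutlass/gemm/gemm.h"

// Number of warps implied by a threadblock tile / warp tile pair (illustrative helper).
template <typename ThreadblockShape, typename WarpShape>
struct WarpCount {
  static int const kM = ThreadblockShape::kM / WarpShape::kM;
  static int const kN = ThreadblockShape::kN / WarpShape::kN;
  static int const kCount = kM * kN;
};

static_assert(WarpCount<cutlass::gemm::GemmShape<128, 128, 32>,
                        cutlass::gemm::GemmShape<64, 64, 32>>::kCount == 4,
              "128x128 threadblock tile over 64x64 warp tiles uses 4 warps (128 threads).");
static_assert(WarpCount<cutlass::gemm::GemmShape<64, 256, 32>,
                        cutlass::gemm::GemmShape<64, 64, 32>>::kCount == 4,
              "64x256 threadblock tile over 64x64 warp tiles also uses 4 warps.");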
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Wgrad_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Conv3d_Wgrad_Optimized_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dWgradKernel = typename cutlass::conv::kernel::DefaultConv3dWgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + 
cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dWgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu index fcc8426ca3..f3552a1847 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu @@ -804,7 +804,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_32x64_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, @@ -874,7 +874,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_32x128_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, @@ -944,7 +944,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_64x128_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, @@ -1014,7 +1014,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_128x128_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, @@ -1084,7 +1084,7 @@ TEST(SM61_Epilogue_threadblock_epilogue, simt_i8_i32_128x64_32x64x8) { // Output operator // - using OutputOp = cutlass::epilogue::thread::LinearCombinationClamp< + using OutputOp = cutlass::epilogue::thread::LinearCombination< ElementOutput, kElementsPerAccess, ElementAccumulator, diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 84247e0bdc..7ead7eba54 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -20,85 +20,109 @@ # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-cutlass_test_unit_add_executable( +add_custom_target( cutlass_test_unit_gemm_device + DEPENDS + cutlass_test_unit_gemm_device_simt + cutlass_test_unit_gemm_device_tensorop_sm70 + cutlass_test_unit_gemm_device_tensorop_sm75 + cutlass_test_unit_gemm_device_tensorop_f16_sm80 + cutlass_test_unit_gemm_device_tensorop_f32_sm80 + cutlass_test_unit_gemm_device_tensorop_f32_tf32_sm80 + cutlass_test_unit_gemm_device_tensorop_f64 + cutlass_test_unit_gemm_device_tensorop_s32_sm80 + cutlass_test_unit_gemm_device_wmma + cutlass_test_unit_gemm_device_tensorop_planar_complex + cutlass_test_unit_gemm_device_sparse_tensorop_sm80 +) + +add_custom_target( + test_unit_gemm_device + DEPENDS + test_unit_gemm_device_simt + test_unit_gemm_device_tensorop_sm70 + test_unit_gemm_device_tensorop_sm75 + test_unit_gemm_device_tensorop_f16_sm80 + test_unit_gemm_device_tensorop_f32_sm80 + test_unit_gemm_device_tensorop_f32_tf32_sm80 + test_unit_gemm_device_tensorop_f64 + test_unit_gemm_device_tensorop_s32_sm80 + test_unit_gemm_device_wmma + test_unit_gemm_device_tensorop_planar_complex + test_unit_gemm_device_sparse_tensorop_sm80 +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_simt BATCH_SOURCES ON BATCH_SIZE 4 - gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu - gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu - gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu + simt_sgemm_nt_sm80.cu + simt_sgemm_tn_sm80.cu + + simt_cgemm_nn_sm50.cu + simt_cgemm_nt_sm50.cu + simt_cgemm_tn_sm50.cu + simt_cgemm_tt_sm50.cu - gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu - gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu - gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu - gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu + simt_dgemm_nn_sm50.cu + simt_dgemm_nt_sm50.cu + simt_dgemm_tn_sm50.cu + simt_dgemm_tt_sm50.cu - gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu - gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu + simt_hgemm_nn_sm50.cu + simt_hgemm_nt_sm50.cu + simt_hgemm_tn_sm50.cu + simt_hgemm_tt_sm50.cu - gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu - gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + simt_igemm_nn_sm50.cu + simt_igemm_nt_sm50.cu + simt_igemm_tn_sm50.cu + simt_igemm_tt_sm50.cu - gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu - gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu + simt_int8_igemm_sm61_sliced_k.cu + simt_int8_igemm_sm61.cu - gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu - gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu - gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu - gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu - gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu - gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu - gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu - gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu - gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu - gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu - gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu - gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu - gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu - gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu - gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu + simt_sgemm_nn_sm50.cu + simt_sgemm_nt_sm50.cu + simt_sgemm_tn_sm50.cu + simt_sgemm_tt_sm50.cu - gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu - gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu - gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu - gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu - 
gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu - gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu - gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu - gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu + simt_zgemm_nn_sm50.cu + simt_zgemm_nt_sm50.cu + simt_zgemm_tn_sm50.cu + simt_zgemm_tt_sm50.cu - gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu - gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu + gemm_splitk_simt_sm50.cu +) - simt_sgemm_nt_sm80.cu - simt_sgemm_tn_sm80.cu +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_sm70 - gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu - gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu - gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu - gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu - gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu - gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu - gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu - gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu + BATCH_SOURCES ON + BATCH_SIZE 4 - gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu - gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu - gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu - gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu + gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu + gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu + gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu + gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu - gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu - gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu + gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu + + gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu + gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu + + gemm_splitk_tensor_op_sm70.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_sm75 + + BATCH_SOURCES ON + BATCH_SIZE 4 - gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -123,54 +147,105 @@ cutlass_test_unit_add_executable( gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu + + gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu + gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu - gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu - gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu - gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu - gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu + gemm_splitk_serial_tensor_op_sm75.cu + gemm_splitk_tensor_op_sm75.cu - gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu +) - gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu - gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_f16_sm80 - simt_cgemm_nn_sm50.cu - simt_cgemm_nt_sm50.cu - simt_cgemm_tn_sm50.cu - simt_cgemm_tt_sm50.cu + BATCH_SOURCES ON + BATCH_SIZE 4 - simt_dgemm_nn_sm50.cu - simt_dgemm_nt_sm50.cu - simt_dgemm_tn_sm50.cu - simt_dgemm_tt_sm50.cu + gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu +) - simt_hgemm_nn_sm50.cu - simt_hgemm_nt_sm50.cu - simt_hgemm_tn_sm50.cu - simt_hgemm_tt_sm50.cu +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_f32_sm80 - simt_igemm_nn_sm50.cu - simt_igemm_nt_sm50.cu - simt_igemm_tn_sm50.cu - simt_igemm_tt_sm50.cu + BATCH_SOURCES ON + BATCH_SIZE 4 - simt_int8_igemm_sm61_sliced_k.cu - simt_int8_igemm_sm61.cu + gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu + gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu + 
gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu + gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu + gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu + gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu + gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu + gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu + gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu + gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu + gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu +) - simt_sgemm_nn_sm50.cu - simt_sgemm_nt_sm50.cu - simt_sgemm_tn_sm50.cu - simt_sgemm_tt_sm50.cu +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_f32_tf32_sm80 - simt_zgemm_nn_sm50.cu - simt_zgemm_nt_sm50.cu - simt_zgemm_tn_sm50.cu - simt_zgemm_tt_sm50.cu + BATCH_SOURCES ON + BATCH_SIZE 4 - gemm_splitk_serial_tensor_op_sm75.cu - gemm_splitk_tensor_op_sm75.cu - gemm_splitk_tensor_op_sm70.cu - gemm_splitk_simt_sm50.cu + gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu + gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu + gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu + gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu + gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu + gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu + gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu + + gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu + gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_f64 + + BATCH_SOURCES ON + BATCH_SIZE 4 + + gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu + gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu + + gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu + gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu + gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu + gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu + gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu + gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_s32_sm80 + + BATCH_SOURCES ON + BATCH_SIZE 4 + + gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu + gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu + gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu + gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu + gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu + + gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu + gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_wmma + + BATCH_SOURCES ON + BATCH_SIZE 4 # wmma floating point tests gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -222,5 +297,37 @@ cutlass_test_unit_add_executable( gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_tensorop_planar_complex + + BATCH_SOURCES ON + BATCH_SIZE 4 + + gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu + gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu + gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu +) + +cutlass_test_unit_add_executable( + cutlass_test_unit_gemm_device_sparse_tensorop_sm80 + + BATCH_SOURCES ON + BATCH_SIZE 4 + gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu + gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu + gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu + gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu + 
gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu + gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu + gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu + gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu ) + diff --git a/test/unit/gemm/device/multistage_testbed.h b/test/unit/gemm/device/multistage_testbed.h index bdc4b77081..f7b6ac8f56 100644 --- a/test/unit/gemm/device/multistage_testbed.h +++ b/test/unit/gemm/device/multistage_testbed.h @@ -97,10 +97,45 @@ struct MultistageTestbed { return true; } + /// Waives test if CUDA device is insufficient + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run(cutlass::gemm::GemmCoord problem_size, ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + + // Waives test if CUDA device is insufficient + if (!sufficient()) { + return true; + } + // // Allocate the GEMM workspace // @@ -144,7 +179,11 @@ struct MultistageTestbed { cutlass::Status status = gemm_op.initialize(arguments); - EXPECT_TRUE(status == cutlass::Status::kSuccess); + if (status != cutlass::Status::kSuccess) { + cudaError_t error = cudaGetLastError(); + std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n"; + return true; + } // // Run the GEMM diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu index 7d2ab45b6f..f0fe1ebd94 100644 --- a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu +++ b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu @@ -39,7 +39,8 @@ #include "cutlass/util/tensor_view_io.h" #include "testbed.h" - + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) //////////////////////////////////////////////////////////////////////////////// TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 32x64x8_32x64x1) { @@ -246,4 +247,8 @@ TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x256x8_64x64x1) { EXPECT_TRUE(test::gemm::device::TestAllGemm()); } -//////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu index 00461d2e0f..c183fbff34 100644 --- a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu +++ b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu @@ -41,8 +41,10 @@ #include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) //////////////////////////////////////////////////////////////////////////////// - + TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 32x64x8_32x64x1) { using Element = float; @@ -246,4 +248,8 @@ 
TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x256x8_64x64x1) { EXPECT_TRUE(test::gemm::device::TestAllGemm()); } -//////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h index b8c739a7e9..c2bf40ec21 100644 --- a/test/unit/gemm/device/testbed.h +++ b/test/unit/gemm/device/testbed.h @@ -247,6 +247,36 @@ struct Testbed { return compare_reference(problem_size, alpha, beta); } + /// Determine if the CUDA device is sufficient to run the kernel + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + + /// Executes one test bool run( cutlass::gemm::GemmCoord problem_size, @@ -254,6 +284,10 @@ struct Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + return true; + } this->initialize(problem_size); @@ -279,7 +313,11 @@ struct Testbed { cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); - EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + if (status != cutlass::Status::kSuccess) { + cudaError_t error = cudaGetLastError(); + std::cerr << "This test is not supported: " << cudaGetErrorString(error) << "\n"; + return true; + } // // Run the GEMM diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h index 65c0fdfb4c..a3e1353ee1 100644 --- a/test/unit/gemm/device/testbed_complex.h +++ b/test/unit/gemm/device/testbed_complex.h @@ -31,6 +31,7 @@ #include #include #include +#include #include "../../common/cutlass_unit_test.h" @@ -100,6 +101,34 @@ struct TestbedComplex : public Testbed { return this->compare_reference(problem_size, alpha, beta); } + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run( cutlass::gemm::GemmCoord problem_size, @@ -107,7 +136,17 @@ struct TestbedComplex : public Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive the test if device not sufficient + if (!sufficient()) { + return true; + } + + 
// + // Initialize workspace + // + this->initialize(problem_size); + // // Initialize the GEMM operator diff --git a/test/unit/gemm/device/testbed_interleaved.h b/test/unit/gemm/device/testbed_interleaved.h index 3cbd720bd4..6e14f87f6e 100644 --- a/test/unit/gemm/device/testbed_interleaved.h +++ b/test/unit/gemm/device/testbed_interleaved.h @@ -99,6 +99,35 @@ struct InterleavedTestbed { return false; } + return true; + } + + /// Waives test if CUDA device is insufficient + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + return true; } @@ -107,6 +136,10 @@ struct InterleavedTestbed { cutlass::gemm::GemmCoord problem_size, ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + + if (!sufficient()) { + return true; + } // // Allocate the GEMM workspace diff --git a/test/unit/gemm/device/testbed_sparse.h b/test/unit/gemm/device/testbed_sparse.h index d1d57b893c..28901a9867 100644 --- a/test/unit/gemm/device/testbed_sparse.h +++ b/test/unit/gemm/device/testbed_sparse.h @@ -295,6 +295,34 @@ struct SparseTestbed { return compare_reference(problem_size, alpha, beta); } + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run( cutlass::gemm::GemmCoord problem_size, @@ -302,6 +330,11 @@ struct SparseTestbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + return true; + } + this->initialize(problem_size); // @@ -327,7 +360,10 @@ struct SparseTestbed { cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); - EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + // This failure is likely due to insufficient device capabilities. Waive the test. 
+ if (status != cutlass::Status::kSuccess) { + return true; + } // // Run the GEMM diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h index a83c27cda6..fb36f10e25 100644 --- a/test/unit/gemm/device/testbed_universal.h +++ b/test/unit/gemm/device/testbed_universal.h @@ -250,6 +250,34 @@ struct TestbedUniversal { return compare_reference(problem_size, alpha, beta); } + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run( cutlass::gemm::GemmUniversalMode mode, @@ -258,6 +286,11 @@ struct TestbedUniversal { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + return true; + } + this->initialize(problem_size); // diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h index 7036e26d97..d667d8f550 100644 --- a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h +++ b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h @@ -328,19 +328,17 @@ struct SparseTestbed { test::gemm::threadblock::kernel_multistage_mma_sparse, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - EXPECT_EQ(result, cudaSuccess) - << " cudaFuncSetAttribute " - "cudaFuncAttributeMaxDynamicSharedMemorySize error: " - << cudaGetErrorString(result); + if (result != cudaSuccess) { + return true; + } result = cudaFuncSetAttribute( test::gemm::threadblock::kernel_multistage_mma_sparse, cudaFuncAttributePreferredSharedMemoryCarveout, 100); - EXPECT_EQ(result, cudaSuccess) - << " cudaFuncSetAttribute " - "cudaFuncAttributePreferredSharedMemoryCarveout error: " - << cudaGetErrorString(result); + if (result != cudaSuccess) { + return true; + } } test::gemm::threadblock::kernel_multistage_mma_sparse diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed.h b/test/unit/gemm/threadblock/mma_multistage_testbed.h index 3870dd22fb..6b8dc94fb6 100644 --- a/test/unit/gemm/threadblock/mma_multistage_testbed.h +++ b/test/unit/gemm/threadblock/mma_multistage_testbed.h @@ -266,19 +266,17 @@ struct Testbed { test::gemm::threadblock::kernel_multistage_mma, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - EXPECT_EQ(result, cudaSuccess) - << " cudaFuncSetAttribute " - "cudaFuncAttributeMaxDynamicSharedMemorySize error: " - << cudaGetErrorString(result); + if (result != cudaSuccess) { + return true; + } result = cudaFuncSetAttribute( test::gemm::threadblock::kernel_multistage_mma, cudaFuncAttributePreferredSharedMemoryCarveout, 100); - EXPECT_EQ(result, cudaSuccess) - << " cudaFuncSetAttribute " - "cudaFuncAttributePreferredSharedMemoryCarveout error: " - << cudaGetErrorString(result); + if (result != cudaSuccess) { + return true; + } } test::gemm::threadblock::kernel_multistage_mma diff --git a/test/unit/gemm/warp/gemm_sm70.cu b/test/unit/gemm/warp/gemm_sm70.cu index 
16f1427e55..3785290e5c 100644 --- a/test/unit/gemm/warp/gemm_sm70.cu +++ b/test/unit/gemm/warp/gemm_sm70.cu @@ -199,6 +199,91 @@ TEST(SM70_warp_gemm_tensor_op_crosswise, 64x64x32_64x64x32_16x16x4) { test::gemm::warp::Testbed >().run(); } + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM70_warp_gemm_volta_tensor_op_canonical_f32_row_col, 64x64x16_64x64x4_8x8x4) { + + using Shape = cutlass::gemm::GemmShape<64, 64, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + cutlass::gemm::GemmShape<16, 16, 4>, + 32, + ElementA, + cutlass::layout::RowMajor, + ElementB, + cutlass::layout::ColumnMajor, + ElementC, + cutlass::layout::RowMajor, + cutlass::arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> + >; + + using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp< + Shape, + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed >() + .run(); +} + +TEST(SM70_warp_gemm_volta_tensor_op_canonical_f32_col_row, 64x64x16_64x64x4_8x8x4) { + + using Shape = cutlass::gemm::GemmShape<64, 64, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::RowMajor; + + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + cutlass::gemm::GemmShape<16, 16, 4>, + 32, + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> + >; + + using MmaTensorOp = cutlass::gemm::warp::MmaVoltaTensorOp< + Shape, + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed >() + .run(); +} + ///////////////////////////////////////////////////////////////////////////////////////////////// #endif // CUTLASS_ARCH_MMA_SM70_SUPPORTED diff --git a/test/unit/gemm/warp/testbed.h b/test/unit/gemm/warp/testbed.h index c0c98d80df..3cc00fb447 100644 --- a/test/unit/gemm/warp/testbed.h +++ b/test/unit/gemm/warp/testbed.h @@ -30,6 +30,7 @@ #include "cutlass/cutlass.h" #include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_types.h" #include "cutlass/subbyte_reference.h" #include "cutlass/platform/platform.h" @@ -1019,9 +1020,11 @@ __global__ void sparse_kernel( typename Mma::ElementB, ThreadblockShape::kN * ThreadblockShape::kK> smem_buffer_B; __shared__ cutlass::AlignedBuffer< - typename Mma::ElementE, ThreadblockShape::kM * ThreadblockShape::kK / + typename Mma::ElementE, Mma::Shape::kM * Mma::Shape::kK / Mma::kSparse / Mma::kElementsPerElementE> smem_buffer_E; + + __syncthreads(); if (threadIdx.x == 0) { typename Mma::ElementA *smem_ptr_A = smem_buffer_A.data(); @@ -1168,6 +1171,7 @@ struct SparseTestbed { /// Allocates workspace in device memory SparseTestbed() { + tensor_A.reset(cutlass::make_Coord(ThreadblockShape::kM, ThreadblockShape::kK / Sparse)); tensor_A_uncompressed.reset( diff --git a/test/unit/reduction/CMakeLists.txt b/test/unit/reduction/CMakeLists.txt index 7b4f267069..96c3716141 100644 --- a/test/unit/reduction/CMakeLists.txt +++ 
b/test/unit/reduction/CMakeLists.txt @@ -22,7 +22,6 @@ add_subdirectory(thread) add_subdirectory(kernel) - add_custom_target( cutlass_test_unit_reduction DEPENDS diff --git a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu index e52af8edf9..8d2382e4cf 100644 --- a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu +++ b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu @@ -81,7 +81,7 @@ __global__ void kernel_gemm_threadblock_tensor_op_multiplicand_store( } } - // Use iterator to scatter results + // Use iterator to store results Iterator iter(ref_output, threadIdx.x); iter.store(frag); } diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 3ca637b2db..e43c821e64 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -26,6 +26,12 @@ if (CUTLASS_ENABLE_LIBRARY) add_subdirectory(library) endif() if (CUTLASS_ENABLE_PROFILER) - add_subdirectory(profiler) + if (NOT CUTLASS_ENABLE_LIBRARY) + message(SEND_ERROR "Build conflict: The CUTLASS profiler requires the CUTLASS library.") + message(SEND_ERROR " CUTLASS_ENABLE_PROFILER = ${CUTLASS_ENABLE_PROFILER}") + message(SEND_ERROR " CUTLASS_ENABLE_LIBRARY = ${CUTLASS_ENABLE_LIBRARY}") + else() + add_subdirectory(profiler) + endif() endif() diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt index 294cd98f01..4bf7577fb8 100644 --- a/tools/library/CMakeLists.txt +++ b/tools/library/CMakeLists.txt @@ -63,6 +63,15 @@ cutlass_add_library( src/reference/gemm.cu src/reference/initialize_reference_operations.cu + + # cutlass reduction instances in cutlass library + src/reduction/reduction_device.cu + src/reduction/init_reduction_operations.cu + + # cutlass conv reference instances in cutlass library + src/reference/conv2d.cu + src/reference/conv3d.cu + ) file(GLOB_RECURSE GENERATOR_PYTHON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/scripts/*.py) @@ -136,7 +145,7 @@ function(cutlass_add_cutlass_library) cutlass_library_includes ) - set_target_properties(${__NAME} PROPERTIES DEBUG_POSTFIX ${CUTLASS_LIBRARY_DEBUG_POSTFIX}) + set_target_properties(${__NAME} PROPERTIES DEBUG_POSTFIX "${CUTLASS_LIBRARY_DEBUG_POSTFIX}") set(OUTPUT_NAME cutlass) diff --git a/tools/library/include/cutlass/library/handle.h b/tools/library/include/cutlass/library/handle.h index 58c6b30c7c..27d2bfe6a4 100644 --- a/tools/library/include/cutlass/library/handle.h +++ b/tools/library/include/cutlass/library/handle.h @@ -335,6 +335,10 @@ class Handle { using HandlePtr = std::unique_ptr; ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Finds conv2d operation instances with Conv2d::ElementC = Reduction::ElementWorkspace +Operation const* find_conv_operation_for_parallel_reduction(Operation const *operation); +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace library } // namespace cutlass diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h index f692437199..6a018a704c 100644 --- a/tools/library/include/cutlass/library/library.h +++ b/tools/library/include/cutlass/library/library.h @@ -53,6 +53,10 @@ #include "cutlass/layout/tensor.h" #include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + 
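The new handle.h entry point above, find_conv_operation_for_parallel_reduction(), pairs a convolution with a reduction instance for parallel split-K: the convolution writes one partial accumulation per partition into a workspace whose element type matches Reduction::ElementWorkspace, and a reduction kernel then sums the partitions and applies the linear-combination epilogue. A hedged host-side sketch of that reduction; the parameter names mirror the ReductionConfiguration / ReductionArguments structures added to library.h further below, while the float element type and the row-major indexing are illustrative assumptions rather than the library's device kernel:

#include <cstdint>

// D = alpha * sum_p(workspace_p) + beta * S, reduced over `partitions`
// partial results spaced `partition_stride` elements apart.
void reduce_split_k_partitions(
    int rows, int cols, int partitions,
    int64_t partition_stride,
    float const *workspace, int64_t ldw,   // workspace operand, leading dimension ldw
    float const *source,    int64_t lds,   // source operand, leading dimension lds
    float *destination,     int64_t ldd,   // destination operand, leading dimension ldd
    float alpha, float beta) {

  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      float sum = 0.0f;
      for (int p = 0; p < partitions; ++p) {
        sum += workspace[p * partition_stride + int64_t(i) * ldw + j];
      }
      destination[int64_t(i) * ldd + j] = alpha * sum + beta * source[int64_t(i) * lds + j];
    }
  }
}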
///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -79,6 +83,10 @@ enum class LayoutTypeID { kTensorNCDHW, kTensorNHWC, kTensorNDHWC, + kTensorNC32HW32, + kTensorC32RSK32, + kTensorNC64HW64, + kTensorC64RSK64, kInvalid }; @@ -138,6 +146,7 @@ enum class Provider { kReferenceHost, kReferenceDevice, kCUBLAS, + kCUDNN, kInvalid }; @@ -146,6 +155,8 @@ enum class Provider { /// Enumeration indicating the kind of operation enum class OperationKind { kGemm, + kConv2d, + kConv3d, kEqGemm, kSparseGemm, kReduction, @@ -204,6 +215,30 @@ enum class GemmKind { /// Mode of Universal GEMM using GemmUniversalMode = cutlass::gemm::GemmUniversalMode; +/// Enumeration indicating what kind of Conv2d operation to perform +enum class ConvKind { + kUnknown, + kFprop, + kDgrad, + kWgrad, + kInvalid +}; + +enum class ConvModeID { + kCrossCorrelation, + kConvolution, + kInvalid +}; + +// Iterator algorithm enum in order of general performance-efficiency +enum class IteratorAlgorithmID { + kNone, + kAnalytic, + kOptimized, + kInvalid +}; + + enum class EpilogueKind { kUnknown, kConversion, @@ -477,6 +512,66 @@ struct ReductionDescription : public OperationDescription { }; ///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Description of all Conv2d operations +struct ConvDescription : public OperationDescription { + /// Describes the convolution dimension support (2D or 3D) + int conv_dim; + + /// Describes the kind of convolution + ConvKind conv_kind; + + /// Describes the type of iterator algorithm (analytic or precomputed) + IteratorAlgorithmID iterator_algorithm; + + /// Describes the A operand + TensorDescription A; + + /// Describes the B operand + TensorDescription B; + + /// Describes the C operand + TensorDescription C; + + /// Describes the data type of the scalars passed to the epilogue + NumericTypeID element_epilogue; + + // + // Methods + // + // Returns Activation TensorDescription + TensorDescription activation() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return A; + case library::ConvKind::kDgrad : return C; + case library::ConvKind::kWgrad : return B; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Filter TensorDescription + TensorDescription filter() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return B; + case library::ConvKind::kDgrad : return B; + case library::ConvKind::kWgrad : return C; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Output TensorDescription + TensorDescription output() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return C; + case library::ConvKind::kDgrad : return A; + case library::ConvKind::kWgrad : return A; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + +}; + + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Base class for all operations @@ -825,6 +920,204 @@ struct SparseGemmArguments { ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Two dimensional convolution +// +// OperationKind: Conv2d +// +struct Conv2dConfiguration { + + conv::SplitKMode split_k_mode; + + /// Conv2d problem size + // contains strictly conv2d size (N,H,W,C,K,R,S,P,Q,padding,stride,dilation,mode) + // also includes (split_k_slices, groups) + 
conv::Conv2dProblemSize problem_size; + + /// Layout object for activations tensor + layout::TensorNHWC layout_activations; + + /// Layout object for filters tensor + layout::TensorNHWC layout_filters; + + /// Layout object for source tensor + layout::TensorNHWC layout_source; + + /// Layout object for output tensor + layout::TensorNHWC layout_output; + + // + // Methods + // + + // Mapping functions (A,B,C -> activation,filter,output) + layout::TensorNHWC layout_a(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_activations; + case library::ConvKind::kDgrad: return layout_output; + case library::ConvKind::kWgrad: return layout_output; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + layout::TensorNHWC layout_b(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_filters; + case library::ConvKind::kDgrad: return layout_filters; + case library::ConvKind::kWgrad: return layout_activations; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + layout::TensorNHWC layout_c(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_output; + case library::ConvKind::kDgrad: return layout_activations; + case library::ConvKind::kWgrad: return layout_filters; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } +}; + + +/// Three dimensional convolution +// +// OperationKind: Conv3d +// +struct Conv3dConfiguration { + + conv::SplitKMode split_k_mode; + + /// Conv3d problem size + // contains strictly conv3d size (N,D,H,W,C,K,T,R,S,Z,P,Q,padding,stride,dilation,mode) + // also includes (split_k_slices, groups) + conv::Conv3dProblemSize problem_size; + + /// Layout object for activations tensor + layout::TensorNDHWC layout_activations; + + /// Layout object for filters tensor + layout::TensorNDHWC layout_filters; + + /// Layout object for source tensor + layout::TensorNDHWC layout_source; + + /// Layout object for output tensor + layout::TensorNDHWC layout_output; + + // + // Methods + // + + // Mapping functions (A,B,C -> activation,filter,output) + layout::TensorNDHWC layout_a(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_activations; + case library::ConvKind::kDgrad: return layout_output; + case library::ConvKind::kWgrad: return layout_output; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + layout::TensorNDHWC layout_b(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_filters; + case library::ConvKind::kDgrad: return layout_filters; + case library::ConvKind::kWgrad: return layout_activations; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + layout::TensorNDHWC layout_c(library::ConvKind const &conv_kind) const { + switch (conv_kind) { + case library::ConvKind::kFprop: return layout_output; + case library::ConvKind::kDgrad: return layout_activations; + case library::ConvKind::kWgrad: return layout_filters; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } +}; + +/// Arguments for CONV +struct ConvArguments { + + ///////////////////////////////////////////////////////// + /// ImplicitGemm matrices A, B, C, D + 
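The layout_a/b/c mapping functions above, like the activation()/filter()/output() accessors on ConvDescription, encode how the convolution tensors take on the A, B, C roles of the implicit GEMM for each operator. A small sketch of the resulting GEMM extents; Extent and ConvOperator are stand-ins (not the library's Conv2dProblemSize or ConvKind), and the formulas are the standard implicit GEMM mapping rather than code taken from this change:

// Fprop: A = activations (NPQ x RSC), B = filter (RSC x K),        C = output (NPQ x K)
// Dgrad: A = output      (NHW x RSK), B = filter (RSK x C),        C = activations (NHW x C)
// Wgrad: A = output      (K x NPQ),   B = activations (NPQ x RSC), C = filter (K x RSC)
struct Extent { int N, H, W, C, K, R, S, P, Q; };

enum class ConvOperator { kFprop, kDgrad, kWgrad };

// Implicit GEMM problem extents (M, N, K) of a 2-D convolution.
void implicit_gemm_extent(ConvOperator op, Extent const &e, int &M, int &N, int &K) {
  switch (op) {
    case ConvOperator::kFprop: M = e.N * e.P * e.Q; N = e.K;             K = e.R * e.S * e.C; break;
    case ConvOperator::kDgrad: M = e.N * e.H * e.W; N = e.C;             K = e.R * e.S * e.K; break;
    case ConvOperator::kWgrad: M = e.K;             N = e.R * e.S * e.C; K = e.N * e.P * e.Q; break;
  }
}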
///////////////////////////////////////////////////////// + /// pointer to implicit gemm matrix A + void const *A; + + /// pointer to implicit gemm matrix B + void const *B; + + /// pointer to implicit gemm matrix C + void const *C; + + /// pointer to implicit gemm destination matrix D + void *D; + + /// Host or device pointer to alpha scalar + void const *alpha; + + /// Host or device pointer to beta scalar + void const *beta; + + /// Enumerant indicating whether alpha/beta point to host or device memory + ScalarPointerMode pointer_mode; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Configuration for Reduction operations +// +// OperationKind: Reduction +// +struct ReductionConfiguration { + + /// Reduction problem size + MatrixCoord problem_size; + + /// Number of partitions to reduce + int partitions; + + /// Number of elements between each partition + int64_t partition_stride; + + /// leading dimension of 'w'orkspace operand + int64_t ldw; + + /// leading dimension of 's'ource operand + int64_t lds; + + /// leading dimension of 'd'estination operand + int64_t ldd; +}; + +/// Arguments for Reduction +struct ReductionArguments { + + /// Pointer to workspace matrix + void const *workspace; + + /// Pointer to source matrix + void const *source; + + /// Pointer to destination matrix + void *destination; + + /// pointer to reference matrix + void *reference; + + /// Host or device pointer to alpha scalar + void const *alpha; + + /// Host or device pointer to beta scalar + void const *beta; + + /// Enumerant indicating whether alpha/beta point to host or device memory + ScalarPointerMode pointer_mode; +}; + } // namespace library } // namespace cutlass diff --git a/tools/library/include/cutlass/library/manifest.h b/tools/library/include/cutlass/library/manifest.h index 7adf0fbbce..2bde2884b4 100644 --- a/tools/library/include/cutlass/library/manifest.h +++ b/tools/library/include/cutlass/library/manifest.h @@ -51,6 +51,9 @@ class Manifest; // init and insert all cutlass gemm operations in manifest object (procedurally generated using generator.py) void initialize_all(Manifest &manifest); +// init and insert all reduction op in manifest object (manually instantiated in library/reduction) +void initialize_all_reduction_op(Manifest &manifest); + ///////////////////////////////////////////////////////////////////////////////////////////////////////// /// List of operations diff --git a/tools/library/include/cutlass/library/operation_table.h b/tools/library/include/cutlass/library/operation_table.h index 3821f65acb..ba19ca123c 100644 --- a/tools/library/include/cutlass/library/operation_table.h +++ b/tools/library/include/cutlass/library/operation_table.h @@ -208,6 +208,262 @@ using GemmOperationFunctionalMap = std::unordered_map< >; ///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// +// Data Structures for Conv Functional Maps +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tuple uniquely identifying conv2d functional behavior +struct ConvFunctionalKey { + library::Provider provider; + library::ConvKind conv_kind; + library::NumericTypeID element_A; + library::LayoutTypeID layout_A; + library::NumericTypeID element_B; + library::LayoutTypeID layout_B; + library::NumericTypeID element_C; + library::LayoutTypeID layout_C; + 
library::NumericTypeID element_accumulator; + library::NumericTypeID element_compute; + + + // + // Methods + // + + inline + ConvFunctionalKey( + library::Provider provider = library::Provider::kInvalid, + library::ConvKind conv_kind = library::ConvKind::kFprop, + library::NumericTypeID element_A = library::NumericTypeID::kF16, + library::LayoutTypeID layout_A = library::LayoutTypeID::kTensorNHWC, + library::NumericTypeID element_B = library::NumericTypeID::kF16, + library::LayoutTypeID layout_B = library::LayoutTypeID::kTensorNHWC, + library::NumericTypeID element_C = library::NumericTypeID::kF16, + library::LayoutTypeID layout_C = library::LayoutTypeID::kTensorNHWC, + library::NumericTypeID element_accumulator = library::NumericTypeID::kF32, + library::NumericTypeID element_compute = library::NumericTypeID::kF32 + ): + provider(provider), + conv_kind(conv_kind), + element_A(element_A), + layout_A(layout_A), + element_B(element_B), + layout_B(layout_B), + element_C(element_C), + layout_C(layout_C), + element_accumulator(element_accumulator), + element_compute(element_compute) + { } + + inline + bool operator==(ConvFunctionalKey const &rhs) const { + return + (provider == rhs.provider) && + (conv_kind == rhs.conv_kind) && + (element_A == rhs.element_A) && + (layout_A == rhs.layout_A) && + (element_B == rhs.element_B) && + (layout_B == rhs.layout_B) && + (element_C == rhs.element_C) && + (layout_C == rhs.layout_C) && + (element_accumulator == rhs.element_accumulator) && + (element_compute == rhs.element_compute); + } + + inline + bool operator!=(ConvFunctionalKey const &rhs) const { + return !(*this == rhs); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// +inline +std::ostream& operator<< (std::ostream& out, const cutlass::library::ConvFunctionalKey& key) { + out << "{\n" + << "provider: " << to_string(key.provider) << std::endl + << "conv_kind: " << to_string(key.conv_kind) << std::endl + << "element_A: " << to_string(key.element_A) << std::endl + << "layout_A: " << to_string(key.layout_A) << std::endl + << "element_B: " << to_string(key.element_B) << std::endl + << "layout_B: " << to_string(key.layout_B) << std::endl + << "element_C: " << to_string(key.element_C) << std::endl + << "layout_C: " << to_string(key.layout_C) << std::endl + << "element_accumulator: " << to_string(key.element_accumulator) << std::endl + << "element_compute: " << to_string(key.element_compute) << std::endl + << "}"; + + return out; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +struct ConvFunctionalKeyHasher { + using IntHash = std::hash; + + inline + static size_t rotl(size_t key, int shl) { + return (key << shl) | (key >> (sizeof(key)*8 - shl)); + } + + inline + size_t operator()(ConvFunctionalKey const &key) const { + IntHash hash; + + return + rotl(hash(int(key.provider)), 1) ^ + rotl(hash(int(key.conv_kind)), 2) ^ + rotl(hash(int(key.element_A)), 3) ^ + rotl(hash(int(key.layout_A)), 4) ^ + rotl(hash(int(key.element_B)), 5) ^ + rotl(hash(int(key.layout_B)), 6) ^ + rotl(hash(int(key.element_C)), 7) ^ + rotl(hash(int(key.layout_C)), 8) ^ + rotl(hash(int(key.element_accumulator)), 9) ^ + rotl(hash(int(key.element_compute)), 10); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Establishes a partial ordering to search for Conv2d operators +struct ConvPreferenceKey { + + int compute_capability; + IteratorAlgorithmID 
iterator_algorithm; + + + // + // Methods + // + + ConvPreferenceKey(): compute_capability(), iterator_algorithm() { } + + ConvPreferenceKey(int cc, IteratorAlgorithmID iterator_algorithm): + compute_capability(cc), iterator_algorithm(iterator_algorithm) { } + + bool operator<(ConvPreferenceKey const &rhs) const { + return (compute_capability < rhs.compute_capability) || + ((compute_capability == rhs.compute_capability) && (iterator_algorithm < rhs.iterator_algorithm)); + } + + bool operator==(ConvPreferenceKey const &rhs) const { + return (compute_capability == rhs.compute_capability) && + (iterator_algorithm == rhs.iterator_algorithm); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Maps minimum compute capability onto a vector of possible operations +using ConvOperationVectorMap = std::map< + ConvPreferenceKey, + std::vector +>; + +/// Maps a GemmFunctionalKey onto a vector of Operation * objects expected to be of kind kGemm +using ConvOperationFunctionalMap = std::unordered_map< + ConvFunctionalKey, + ConvOperationVectorMap, + ConvFunctionalKeyHasher +>; +///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Tuple uniquely identifying conv2d functional behavior +struct ReductionFunctionalKey { + library::Provider provider; + library::NumericTypeID element_workspace; + library::NumericTypeID element_accumulator; + library::NumericTypeID element_output; + library::NumericTypeID element_compute; + library::MathOperationID reduce_math_op; + library::EpilogueKind epilogue_math_op; + + + // + // Methods + // + + inline + ReductionFunctionalKey( + library::Provider provider = library::Provider::kInvalid, + library::NumericTypeID element_workspace = library::NumericTypeID::kF16, + library::NumericTypeID element_accumulator = library::NumericTypeID::kF32, + library::NumericTypeID element_output = library::NumericTypeID::kF16, + library::NumericTypeID element_compute = library::NumericTypeID::kF32, + library::MathOperationID reduce_math_op = library::MathOperationID::kAdd, + library::EpilogueKind epilogue_math_op = library::EpilogueKind::kLinearCombination + ): + provider(provider), + element_workspace(element_workspace), + element_accumulator(element_accumulator), + element_output(element_output), + element_compute(element_compute), + reduce_math_op(reduce_math_op), + epilogue_math_op(epilogue_math_op) + { } + + inline + bool operator==(ReductionFunctionalKey const &rhs) const { + return + (provider == rhs.provider) && + (element_workspace == rhs.element_workspace) && + (element_accumulator == rhs.element_accumulator) && + (element_output == rhs.element_output) && + (element_compute == rhs.element_compute) && + (reduce_math_op == rhs.reduce_math_op) && + (epilogue_math_op == rhs.epilogue_math_op); + } + + inline + bool operator!=(ReductionFunctionalKey const &rhs) const { + return !(*this == rhs); + } +}; + + +struct ReductionFunctionalKeyHasher { + using IntHash = std::hash; + + inline + static size_t rotl(size_t key, int shl) { + return (key << shl) | (key >> (sizeof(key)*8 - shl)); + } + + inline + size_t operator()(ReductionFunctionalKey const &key) const { + IntHash hash; + + return + rotl(hash(int(key.provider)), 1) ^ + rotl(hash(int(key.element_workspace)), 2) ^ + rotl(hash(int(key.element_accumulator)), 3) ^ + rotl(hash(int(key.element_output)), 4) ^ + rotl(hash(int(key.element_compute)), 5) ^ + rotl(hash(int(key.reduce_math_op)), 6) ^ + 
rotl(hash(int(key.epilogue_math_op)), 7); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +inline +std::ostream& operator<< (std::ostream& out, const ReductionFunctionalKey& key) { + out << "{\n" + << "provider: " << library::to_string(key.provider) << std::endl + << "element_workspace : " << library::to_string(key.element_workspace) << std::endl + << "element_accumulator : " << library::to_string(key.element_accumulator) << std::endl + << "element_output : " << library::to_string(key.element_output) << std::endl + << "element_compute : " << library::to_string(key.element_compute) << std::endl + << "}"; + return out; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// ReductionOperationFunctionalMap has NO preference key and a single instance per functional key +// i.e. only one tile size configuration per functional key +using ReductionOperationFunctionalMap = std::unordered_map< + ReductionFunctionalKey, + library::Operation const *, + ReductionFunctionalKeyHasher +>; + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Table of cutlass::library::Operation instances @@ -218,6 +474,18 @@ class OperationTable { // provider (kCUTLASS) GemmOperationFunctionalMap gemm_operations; + /// Map of all operations of type kConv2d + // provider (kCUTLASS, kReferenceHost, kReferenceDevice) + ConvOperationFunctionalMap conv2d_operations; + + /// Map of all operations of type kConv3d + // provider (kCUTLASS, kReferenceHost, kReferenceDevice) + ConvOperationFunctionalMap conv3d_operations; + + /// Map of all operations of type kReduction + // provider (kCUTLASS) + ReductionOperationFunctionalMap reduction_operations; + public: void append(Manifest const &manifest); diff --git a/tools/library/include/cutlass/library/util.h b/tools/library/include/cutlass/library/util.h index 526f836b2b..2e4a28c145 100644 --- a/tools/library/include/cutlass/library/util.h +++ b/tools/library/include/cutlass/library/util.h @@ -122,6 +122,27 @@ char const *to_string(SplitKMode split_k_mode, bool pretty = false); template <> SplitKMode from_string(std::string const &str); +/// Converts a ConvModeID enumerant to a string +char const *to_string(ConvModeID type, bool pretty = false); + +/// Converts a ConvModeID enumerant from a string +template <> +ConvModeID from_string(std::string const &str); + +/// Converts an IteratorAlgorithmID enumerant to a string +char const *to_string(IteratorAlgorithmID type, bool pretty = false); + +/// Converts an IteratorAlgorithmID enumerant from a string +template <> +IteratorAlgorithmID from_string(std::string const &str); + +/// Converts a ConvKind enumerant to a string +char const *to_string(ConvKind type, bool pretty = false); + +/// Converts a ConvKind enumerant from a string +template <> +ConvKind from_string(std::string const &str); + /// Lexical cast from int64_t to string std::string lexical_cast(int64_t int_value); diff --git a/tools/library/scripts/conv2d_operation.py b/tools/library/scripts/conv2d_operation.py new file mode 100644 index 0000000000..e164bd007e --- /dev/null +++ b/tools/library/scripts/conv2d_operation.py @@ -0,0 +1,344 @@ +# +# \file generator.py +# +# \brief Generates the CUTLASS Library's instances +# +# + +import enum +import os.path +import shutil + +from library import * + +################################################################################################### + +# +class Conv2dOperation: + 
# + def __init__(self, conv_kind, iterator_algorithm, arch, tile_description, A, B, C, element_epilogue, \ + stride_support, epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4): + + self.operation_kind = OperationKind.Conv2d + self.arch = arch + self.tile_description = tile_description + self.conv_kind = conv_kind + self.A = A + self.B = B + self.C = C + self.element_epilogue = element_epilogue + self.epilogue_functor = epilogue_functor + self.iterator_algorithm = iterator_algorithm + self.stride_support = stride_support + self.swizzling_functor = swizzling_functor + # + def is_complex(self): + complex_operators = [ + MathOperation.multiply_add_complex, + MathOperation.multiply_add_complex_gaussian + ] + return self.tile_description.math_instruction.math_operation in complex_operators + + # + def accumulator_type(self): + accum = self.tile_description.math_instruction.element_accumulator + + if self.is_complex(): + return get_complex_from_real(accum) + + return accum + + # + def core_name(self): + ''' The basic operation kind is prefixed with a letter indicating the accumulation type. ''' + + intermediate_type = '' + + if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp: + inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape) + if self.tile_description.math_instruction.element_a != self.A.element and \ + self.tile_description.math_instruction.element_a != self.accumulator_type(): + intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] + else: + inst_shape = '' + + return "%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \ + inst_shape, intermediate_type, ConvKindNames[self.conv_kind], IteratorAlgorithmNames[self.iterator_algorithm]) + + # + def extended_name(self): + ''' Append data types if they differ from compute type. ''' + if self.C.element != self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${element_c}_${core_name}_${element_a}" + elif self.C.element == self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${core_name}_${element_a}" + else: + extended_name = "${core_name}" + + extended_name = SubstituteTemplate(extended_name, { + 'element_a': DataTypeNames[self.A.element], + 'element_c': DataTypeNames[self.C.element], + 'core_name': self.core_name() + }) + + return extended_name + + # + def layout_name(self): + return "%s" % (ShortLayoutTypeNames[self.A.layout]) + + # + def configuration_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. 
''' + + opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] + + threadblock = "%dx%d_%dx%d" % ( + self.tile_description.threadblock_shape[0], + self.tile_description.threadblock_shape[1], + self.tile_description.threadblock_shape[2], + self.tile_description.stages + ) + + if self.stride_support == StrideSupport.Unity: + configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_unity_stride" + else: + configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}" + + return SubstituteTemplate( + configuration_name, + { + 'opcode_class': opcode_class_name, + 'extended_name': self.extended_name(), + 'threadblock': threadblock, + 'layout': self.layout_name(), + } + ) + + # + def procedural_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' + return self.configuration_name() + +################################################################################################### +# +# Emits single instances of a CUTLASS device-wide operator +# +################################################################################################### + +class EmitConv2dInstance: + def __init__(self): + self.template = """ + // Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}" + using ${operation_name}_base = + typename cutlass::conv::kernel::DefaultConv2d${conv_kind_name}< + ${element_a}, + ${layout_a}, + ${element_b}, + ${layout_b}, + ${element_c}, + ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + ${stages}, + ${math_operator}, + ${iterator_algorithm}, + ${stride_support} + >::Kernel; +""" + + + def emit(self, operation): + + warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + values = { + 'operation_name': operation.procedural_name(), + 'conv_kind': ConvKindTag[operation.conv_kind], + 'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[operation.A.layout], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[operation.B.layout], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[operation.C.layout], + 'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 
'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm], + 'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(), + 'stride_support': StrideSupportTag[operation.stride_support], + 'math_operator': 'cutlass::arch::OpMultiplyAddComplex' if operation.is_complex() else \ + MathOperationTag[operation.tile_description.math_instruction.math_operation] + } + + return SubstituteTemplate(self.template, values) + +################################################################################################### +# +# Generator functions for all layouts +# +################################################################################################### + +# +def GenerateConv2dTensorOp(manifest, tile_descriptions, min_cc, align = 128): + + for tile in tile_descriptions: + for conv_kind in [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad]: + + if conv_kind == ConvKind.Fprop or (tile.math_instruction.element_accumulator in [DataType.f16, DataType.f32]): + + # + output_types = [tile.math_instruction.element_a, tile.math_instruction.element_accumulator] \ + if DataTypeSize[tile.math_instruction.element_accumulator] == 32 \ + else [tile.math_instruction.element_accumulator,] + + for output_type in output_types: + A = TensorDescription(tile.math_instruction.element_a, LayoutType.TensorNHWC, int(align / DataTypeSize[tile.math_instruction.element_a])) + B = TensorDescription(tile.math_instruction.element_b, LayoutType.TensorNHWC, int(align / DataTypeSize[tile.math_instruction.element_b])) + C = TensorDescription(output_type, LayoutType.TensorNHWC, max(1, int(align / DataTypeSize[output_type]))) + + manifest.append(Conv2dOperation(conv_kind, min_cc, tile, A, B, C, tile.math_instruction.element_accumulator)) + +################################################################################################### +# +# Emitters functions for all targets +# +################################################################################################### + +class EmitConv2dConfigurationLibrary: + def __init__(self, operation_path, configuration_name): + self.configuration_name = configuration_name + self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name) + + self.instance_emitter = EmitConv2dInstance() + + self.instance_template = """ +${operation_instance} + +// Derived class +struct ${operation_name} : + public ${operation_name}_base { }; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + self.header_template = """ +/* + Generated by conv2d_operation.py - Do not edit. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "library_internal.h" +#include "conv2d_operation.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// +""" + + self.configuration_header = """ + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_${configuration_name}(Manifest &manifest) { + +""" + + self.configuration_instance = """ + using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution< + ${operation_name}>; + + manifest.append(new cutlass::library::Conv2dOperation< + Operation_${operation_name}>( + "${operation_name}")); + +""" + + self.configuration_epilogue = """ +} +""" + self.epilogue_template = """ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + + # + def __enter__(self): + self.configuration_file = open(self.configuration_path, "w") + self.configuration_file.write(SubstituteTemplate(self.header_template, { + 'configuration_name': self.configuration_name + })) + self.operations = [] + return self + + # + def emit(self, operation): + self.operations.append(operation) + self.configuration_file.write(SubstituteTemplate(self.instance_template, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name(), + 'operation_instance': self.instance_emitter.emit(operation) + })) + + # + def __exit__(self, exception_type, exception_value, traceback): + + self.configuration_file.write(SubstituteTemplate(self.configuration_header, { + 'configuration_name': self.configuration_name + })) + + for operation in self.operations: + self.configuration_file.write(SubstituteTemplate(self.configuration_instance, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name() + })) + + self.configuration_file.write(self.configuration_epilogue) + self.configuration_file.write(self.epilogue_template) + self.configuration_file.close() + + +################################################################################################### +################################################################################################### + diff --git a/tools/library/scripts/conv3d_operation.py b/tools/library/scripts/conv3d_operation.py new file mode 100644 index 0000000000..4ba31b0395 --- /dev/null +++ b/tools/library/scripts/conv3d_operation.py @@ -0,0 +1,321 @@ +# +# \file generator.py +# +# \brief Generates the CUTLASS Library's instances +# +# + +import enum +import os.path +import shutil + +from library import * + +################################################################################################### + +# +class Conv3dOperation: + # + def __init__(self, conv_kind, iterator_algorithm, arch, tile_description, A, B, C, element_epilogue, \ + stride_support, epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4): + + self.operation_kind = OperationKind.Conv3d + self.arch = arch + self.tile_description = tile_description + self.conv_kind = conv_kind + self.A = A + self.B = B + self.C = C + self.element_epilogue = element_epilogue + self.epilogue_functor = epilogue_functor + 
self.iterator_algorithm = iterator_algorithm + self.stride_support = stride_support + self.swizzling_functor = swizzling_functor + + # + def core_name(self): + ''' The basic operation kind is prefixed with a letter indicating the accumulation type. ''' + + intermediate_type = '' + + if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp: + inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape) + if self.tile_description.math_instruction.element_a != self.A.element and \ + self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator: + intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] + else: + inst_shape = '' + + return "%s%s%s%s3d_%s" % (ShortDataTypeNames[self.tile_description.math_instruction.element_accumulator], \ + inst_shape, intermediate_type, ConvKindNames[self.conv_kind], IteratorAlgorithmNames[self.iterator_algorithm]) + + # + def extended_name(self): + ''' Append data types if they differ from compute type. ''' + if self.C.element != self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${element_c}_${core_name}_${element_a}" + elif self.C.element == self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${core_name}_${element_a}" + else: + extended_name = "${core_name}" + + extended_name = SubstituteTemplate(extended_name, { + 'element_a': DataTypeNames[self.A.element], + 'element_c': DataTypeNames[self.C.element], + 'core_name': self.core_name() + }) + + return extended_name + + # + def configuration_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' + + opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] + + threadblock = "%dx%d_%dx%d" % ( + self.tile_description.threadblock_shape[0], + self.tile_description.threadblock_shape[1], + self.tile_description.threadblock_shape[2], + self.tile_description.stages + ) + + if self.stride_support == StrideSupport.Unity: + configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_unity_stride" + else: + configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}" + + return SubstituteTemplate( + configuration_name, + { + 'opcode_class': opcode_class_name, + 'extended_name': self.extended_name(), + 'threadblock': threadblock, + } + ) + + # + def procedural_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. 
''' + return self.configuration_name() + +################################################################################################### +# +# Emits single instances of a CUTLASS device-wide operator +# +################################################################################################### + +class EmitConv3dInstance: + def __init__(self): + self.template = """ + // Conv3d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}" + using ${operation_name}_base = + typename cutlass::conv::kernel::DefaultConv3d${conv_kind_name}< + ${element_a}, + cutlass::layout::TensorNDHWC, + ${element_b}, + cutlass::layout::TensorNDHWC, + ${element_c}, + cutlass::layout::TensorNDHWC, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>, + ${stages}, + cutlass::arch::OpMultiplyAdd, + ${iterator_algorithm}, + ${stride_support} + >::Kernel; +""" + + + def emit(self, operation): + + warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + values = { + 'operation_name': operation.procedural_name(), + 'conv_kind': ConvKindTag[operation.conv_kind], + 'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[operation.A.layout], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[operation.B.layout], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[operation.C.layout], + 'element_accumulator': DataTypeTag[operation.tile_description.math_instruction.element_accumulator], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm], + 'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(), + 'stride_support': 
StrideSupportTag[operation.stride_support] + } + + return SubstituteTemplate(self.template, values) + +################################################################################################### +# +# Generator functions for all layouts +# +################################################################################################### + +# +def GenerateConv3dTensorOp(manifest, tile_descriptions, min_cc, align = 128): + + for tile in tile_descriptions: + for conv_kind in [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad]: + + if conv_kind == ConvKind.Fprop or (tile.math_instruction.element_accumulator in [DataType.f16, DataType.f32]): + + # + output_types = [tile.math_instruction.element_a, tile.math_instruction.element_accumulator] \ + if DataTypeSize[tile.math_instruction.element_accumulator] == 32 \ + else [tile.math_instruction.element_accumulator,] + + for output_type in output_types: + A = TensorDescription(tile.math_instruction.element_a, LayoutType.TensorNDHWC, int(align / DataTypeSize[tile.math_instruction.element_a])) + B = TensorDescription(tile.math_instruction.element_b, LayoutType.TensorNDHWC, int(align / DataTypeSize[tile.math_instruction.element_b])) + C = TensorDescription(output_type, LayoutType.TensorNDHWC, max(1, int(align / DataTypeSize[output_type]))) + + manifest.append(Conv3dOperation(conv_kind, min_cc, tile, A, B, C, tile.math_instruction.element_accumulator)) + +################################################################################################### +# +# Emitters functions for all targets +# +################################################################################################### + +class EmitConv3dConfigurationLibrary: + def __init__(self, operation_path, configuration_name): + self.configuration_name = configuration_name + self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name) + + self.instance_emitter = EmitConv3dInstance() + + self.instance_template = """ +${operation_instance} + +// Derived class +struct ${operation_name} : + public ${operation_name}_base { }; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + self.header_template = """ +/* + Generated by conv3d_operation.py - Do not edit. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "library_internal.h" +#include "conv3d_operation.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// +""" + + self.configuration_header = """ + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_${configuration_name}(Manifest &manifest) { + +""" + + self.configuration_instance = """ + using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution< + ${operation_name}>; + + manifest.append(new cutlass::library::Conv3dOperation< + Operation_${operation_name}>( + "${operation_name}")); + +""" + + self.configuration_epilogue = """ +} +""" + self.epilogue_template = """ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + + # + def __enter__(self): + self.configuration_file = open(self.configuration_path, "w") + self.configuration_file.write(SubstituteTemplate(self.header_template, { + 'configuration_name': self.configuration_name + })) + self.operations = [] + return self + + # + def emit(self, operation): + self.operations.append(operation) + self.configuration_file.write(SubstituteTemplate(self.instance_template, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name(), + 'operation_instance': self.instance_emitter.emit(operation) + })) + + # + def __exit__(self, exception_type, exception_value, traceback): + + self.configuration_file.write(SubstituteTemplate(self.configuration_header, { + 'configuration_name': self.configuration_name + })) + + for operation in self.operations: + self.configuration_file.write(SubstituteTemplate(self.configuration_instance, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name() + })) + + self.configuration_file.write(self.configuration_epilogue) + self.configuration_file.write(self.epilogue_template) + self.configuration_file.close() + + +################################################################################################### +################################################################################################### + diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py index f21acaaf6e..491997cb89 100644 --- a/tools/library/scripts/generator.py +++ b/tools/library/scripts/generator.py @@ -11,7 +11,6 @@ from library import * from manifest import * -from gemm_operation import * ################################################################################################### # @@ -118,10 +117,9 @@ def CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, data_t gemm_kinds = [GemmKind.PlanarComplex, GemmKind.PlanarComplexArray] - # by default, only generate the largest tile and largest alignment + # by default, planar complex gemm kernels are not generated if manifest.args.kernels == '': - tile_descriptions = [tile_descriptions[0],] - alignment_constraints = [alignment_constraints[0],] + return for gemm_kind in gemm_kinds: for layout in layouts: @@ -141,6 +139,103 @@ def CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, data_t return 
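The CreateConv2dOperator / CreateConv3dOperator functions added below feed the EmitConv2dInstance template shown earlier in conv2d_operation.py. For orientation, an emitted instance expands to roughly the following; the data types, tile shapes, and alignment here are illustrative choices and not necessarily a configuration the default generator emits:

#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"

// Example Fprop kernel: f16 NHWC operands, f32 accumulation, SM80 tensor ops,
// optimized (precomputed) iterator algorithm, strided problem support.
using ExampleConv2dFpropKernel = cutlass::conv::kernel::DefaultConv2dFprop<
    cutlass::half_t, cutlass::layout::TensorNHWC,   // A: activations
    cutlass::half_t, cutlass::layout::TensorNHWC,   // B: filters
    cutlass::half_t, cutlass::layout::TensorNHWC,   // C: output
    float,                                          // accumulator
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,         // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,           // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,            // instruction shape
    cutlass::epilogue::thread::LinearCombination<cutlass::half_t, 8, float, float>,
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>,
    3,                                              // stages
    cutlass::arch::OpMultiplyAdd,
    cutlass::conv::IteratorAlgorithm::kOptimized,
    cutlass::conv::StrideSupport::kStrided
>::Kernel;

// Device-level operator wrapping the kernel, as registered with the manifest.
using ExampleConv2dFprop =
    cutlass::conv::device::ImplicitGemmConvolution<ExampleConv2dFpropKernel>;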
########################################################################################################### +# ConvolutionOperator support variations +# ____________________________________________________________________ +# ConvolutionalOperator | Analytic | Optimized +# ____________________________________________________________________ +# | Fprop | (strided) | (strided) +# | Dgrad | (strided, unity*) | (unity) +# | Wgrad | (strided) | (strided) +# ____________________________________________________________________ +# +# Note : Operator marked (*) are supported but not generated to keep the instantiated kernel count low +########################################################################################################### +# Convolution for 2D operations +def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment, \ + conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination): + + element_a, element_b, element_c, element_epilogue = data_type + + # one exceptional case + alignment_c = min(8, alignment) + + # iterator algorithm (analytic and optimized) + iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized] + + # by default, only generate the largest tile size + if manifest.args.kernels == '': + tile_descriptions = [tile_descriptions[0],] + + operations = [] + + for tile in tile_descriptions: + for conv_kind in conv_kinds: + for iterator_algorithm in iterator_algorithms: + A = TensorDescription(element_a, layout[0], alignment) + B = TensorDescription(element_b, layout[1], alignment) + C = TensorDescription(element_c, layout[2], alignment_c) + + # unity stride only for Optimized Dgrad + if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad): + new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ + A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor) + + manifest.append(new_operation) + operations.append(new_operation) + + # strided dgrad is not supported by Optimized Dgrad + if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad): + continue + + # strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic) + new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ + A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor) + + manifest.append(new_operation) + operations.append(new_operation) + + return operations + +# Convolution for 3D operations +def CreateConv3dOperator(manifest, layout, tile_descriptions, data_type, alignment, \ + conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination): + + element_a, element_b, element_c, element_epilogue = data_type + + # one exceptional case + alignment_c = min(8, alignment) + + # iterator algorithm (analytic and optimized) + iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized] + + # by default, only generate the largest tile size + if manifest.args.kernels == '': + tile_descriptions = [tile_descriptions[0],] + + operations = [] + + for tile in tile_descriptions: + for conv_kind in conv_kinds: + for iterator_algorithm in iterator_algorithms: + A = TensorDescription(element_a, layout, alignment) + B = TensorDescription(element_b, layout, alignment) + C = TensorDescription(element_c, layout, alignment_c) + + # optimized conv3d iterator 
algorithm is only for Wgrad + if (iterator_algorithm == IteratorAlgorithm.Optimized) \ + and ((conv_kind == ConvKind.Fprop) or (conv_kind == ConvKind.Dgrad)): + continue + + # strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic) + new_operation = Conv3dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ + A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor) + + manifest.append(new_operation) + operations.append(new_operation) + + + return operations + ################################################################################################### ################################################################################################### @@ -191,11 +286,57 @@ def GenerateSM50_Simt(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + if math_inst.element_a == DataType.f32: + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) +# + +# +def GenerateSM50_Simt_complex(manifest, args): + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 1], \ + DataType.f32, DataType.f32, DataType.f32, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add_complex), + ] + + min_cc = 50 + max_cc = 1024 + + alignment_constraints = [1,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + DataType.cf32, + DataType.cf32, + DataType.cf32, + DataType.cf32, + ] + + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) + + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) # # def GenerateSM50(manifest, args): GenerateSM50_Simt(manifest, args) + GenerateSM50_Simt_complex(manifest, args) ################################################################################################### ################################################################################################### @@ -362,6 +503,9 @@ def GenerateSM70_TensorOp_884(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 8) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. 
F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -375,6 +519,8 @@ def GenerateSM70_TensorOp_884(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, 8) + # def GenerateSM70_PlanarComplexTensorOp_884(manifest, args): @@ -504,50 +650,10 @@ def GenerateSM70_WmmaTensorOp_161616(manifest, args): # ################################################################################################## # -def GenerateSM70_Simt_complex(manifest, args): - math_instructions = [ - MathInstruction( \ - [1, 1, 1], \ - DataType.f32, DataType.f32, DataType.f32, \ - OpcodeClass.Simt, \ - MathOperation.multiply_add_complex), - ] - - min_cc = 70 - max_cc = 1024 - - alignment_constraints = [1,] - - for math_inst in math_instructions: - tile_descriptions = [ - TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 32, 16], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([32, 64, 16], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([32, 32, 16], 2, [2, 2, 1], math_inst, min_cc, max_cc), - ] - data_type = [ - DataType.cf32, - DataType.cf32, - DataType.cf32, - DataType.cf32 - ] - - complex_transforms = [ - (ComplexTransform.none, ComplexTransform.none), - (ComplexTransform.conj, ComplexTransform.none), - (ComplexTransform.none, ComplexTransform.conj), - (ComplexTransform.conj, ComplexTransform.conj) - ] - -# def GenerateSM70(manifest, args): GenerateSM70_TensorOp_884(manifest, args) GenerateSM70_PlanarComplexTensorOp_884(manifest, args) - GenerateSM70_Simt_complex(manifest, args) # To limit build size, WMMA GEMMs are disabled for now. # @@ -607,6 +713,9 @@ def GenerateSM75_TensorOp_1688(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 8) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -620,6 +729,8 @@ def GenerateSM75_TensorOp_1688(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, 8) + # # @@ -738,6 +849,10 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. 
F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -753,6 +868,9 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: if op.tile_description.threadblock_shape[1] >= 128: op.C.alignment = 16 @@ -794,6 +912,8 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 64], 2, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), @@ -809,9 +929,13 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) +# conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) +# +# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, +# data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: op.C.alignment = 8 - # # @@ -862,6 +986,10 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. 
F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -877,6 +1005,9 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: if op.tile_description.threadblock_shape[1] >= 128: op.C.alignment = 8 @@ -920,9 +1051,9 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 256, 128], 2, [2, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 128], 2, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 128], 2, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), ] # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation) @@ -938,9 +1069,13 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) +# conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) +# +# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, +# data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: op.C.alignment = 16 - # # @@ -1074,6 +1209,8 @@ def GenerateSM75_Simt_complex(manifest, args): (ComplexTransform.conj, ComplexTransform.conj) ] + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) # def GenerateSM75(manifest, args): @@ -1124,6 +1261,7 @@ def GenerateSM80_TensorOp_16816(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [8, 4, 2] @@ -1137,10 +1275,10 @@ def GenerateSM80_TensorOp_16816(manifest, args): TileDescription([128, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 32], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, 
min_cc, max_cc), @@ -1157,6 +1295,10 @@ def GenerateSM80_TensorOp_16816(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 8) + CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type, 8) + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation) if math_inst.element_a != math_inst.element_accumulator: @@ -1170,6 +1312,8 @@ def GenerateSM80_TensorOp_16816(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, 8) + CreateConv3dOperator(manifest, LayoutType.TensorNDHWC, tile_descriptions, data_type_mixed, 8) # # @@ -1205,22 +1349,23 @@ def GenerateSM80_SparseTensorOp_16832(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [8, 4, 2] for math_inst in math_instructions: tile_descriptions = [ - TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), ] @@ -1348,6 +1493,7 @@ def GenerateSM80_TensorOp_16832_TN(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [16,] @@ -1361,10 +1507,10 @@ def GenerateSM80_TensorOp_16832_TN(manifest, args): TileDescription([128, 64, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 64], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 64], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + 
TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), @@ -1382,6 +1528,13 @@ def GenerateSM80_TensorOp_16832_TN(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: if op.tile_description.threadblock_shape[1] >= 128: op.C.alignment = 16 @@ -1409,21 +1562,22 @@ def GenerateSM80_SparseTensorOp_16864_TN(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [16,] tile_descriptions = [ - TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 128], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([ 64, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), ] @@ -1489,10 +1643,14 @@ def GenerateSM80_TensorOp_16832_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) - + +# conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) +# +# 
operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, +# data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: op.C.alignment = 8 - # # @@ -1520,6 +1678,7 @@ def GenerateSM80_TensorOp_16864_TN(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [32,] @@ -1533,14 +1692,14 @@ def GenerateSM80_TensorOp_16864_TN(manifest, args): TileDescription([128, 64, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 128], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 256], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 256], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 256], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 256], 5, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), ] data_type = [math_inst.element_a, math_inst.element_b, math_inst.element_accumulator, DataType.s32] @@ -1582,20 +1741,21 @@ def GenerateSM80_SparseTensorOp_168128_TN(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [32,] tile_descriptions = [ - TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 256], 3, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 128, 256], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 256], 3, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 256], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 256], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 128, 256], 6, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 512], 3, [2, 4, 1], math_inst, min_cc, 
max_cc_smem_limited), + TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([ 64, 128, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 512], 3, [2, 2, 1], math_inst, min_cc, max_cc), ] @@ -1655,9 +1815,7 @@ def GenerateSM80_TensorOp_16864_Interleaved(manifest, args): TileDescription([256, 64, 128], 4, [4, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 256, 128], 4, [1, 4, 1], math_inst, min_cc, max_cc), TileDescription([128, 128, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 128], 6, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 64, 128], 10, [2, 2, 1], math_inst, min_cc, max_cc), ] data_type_mixed = [math_inst.element_a, math_inst.element_b, math_inst.element_a, DataType.f32] @@ -1666,7 +1824,12 @@ def GenerateSM80_TensorOp_16864_Interleaved(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) - + +# conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) +# +# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, +# data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + for op in operations: op.C.alignment = 16 # @@ -1744,6 +1907,7 @@ def GenerateSM80_TensorOp_1688(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [4, 2, 1] @@ -1757,11 +1921,11 @@ def GenerateSM80_TensorOp_1688(manifest, args): TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc), @@ -1787,6 +1951,10 @@ def GenerateSM80_TensorOp_1688(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 4) + + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type_mixed, 4) # # @@ -1822,6 +1990,7 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [4, 2, 1] @@ -1835,11 
+2004,11 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, args): TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc), @@ -1850,6 +2019,8 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 4) # # @@ -1875,22 +2046,23 @@ def GenerateSM80_SparseTensorOp_16816_fast_math(manifest, args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [4, 2, 1] for math_inst in math_instructions: tile_descriptions = [ - TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([256, 64, 32], 3, [4, 1, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([256, 64, 64], 3, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([ 64, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([ 64, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), ] @@ -1971,13 +2143,14 @@ def GenerateSM80_TensorOp_884(manifest, 
args): min_cc = 80 max_cc = 1024 + max_cc_smem_limited = 80 alignment_constraints = [1,] tile_descriptions = [ - TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), + TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc_smem_limited), TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), @@ -2090,7 +2263,7 @@ def GenerateSM80_TensorOp_884_complex_gaussian(manifest, args): ################################################################################################### # -def GenerateSM80_Simt(manifest, args): +def GenerateSM80_Simt_f32(manifest, args): layouts = [ (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), @@ -2136,8 +2309,55 @@ def GenerateSM80_Simt(manifest, args): CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type, alignment_constraints) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) # + +# +def GenerateSM80_Simt_f64(manifest, args): + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 1], \ + DataType.f64, DataType.f64, DataType.f64, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([128, 128, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([128, 32, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 128, 8], 5, [1, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) +# + + ################################################################################################## # def GenerateSM80_Simt_complex(manifest, args): @@ -2154,7 +2374,29 @@ def GenerateSM80_Simt_complex(manifest, args): alignment_constraints = [1,] + data_type = [ + DataType.cf32, + DataType.cf32, + DataType.cf32, + DataType.cf32 + ] + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), 
+ ] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + for math_inst in math_instructions: + tile_descriptions = [ TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc), @@ -2165,20 +2407,11 @@ def GenerateSM80_Simt_complex(manifest, args): TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc), ] - data_type = [ - DataType.cf32, - DataType.cf32, - DataType.cf32, - DataType.cf32 - ] - complex_transforms = [ - (ComplexTransform.none, ComplexTransform.none), - (ComplexTransform.conj, ComplexTransform.none), - (ComplexTransform.none, ComplexTransform.conj), - (ComplexTransform.conj, ComplexTransform.conj) - ] + CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, alignment_constraints, complex_transforms) + conv_layout = (LayoutType.TensorNHWC, LayoutType.TensorNHWC, LayoutType.TensorNHWC) + CreateConv2dOperator(manifest, conv_layout, tile_descriptions, data_type, 1) # ################################################################################################### @@ -2202,7 +2435,8 @@ def GenerateSM80(manifest, args): GenerateSM80_SparseTensorOp_168128_TN(manifest, args) GenerateSM80_TensorOp_16864_Interleaved(manifest, args) GenerateSM80_TensorOp_168256(manifest, args) - GenerateSM80_Simt(manifest, args) + GenerateSM80_Simt_f32(manifest, args) + GenerateSM80_Simt_f64(manifest, args) GenerateSM80_Simt_complex(manifest, args) ################################################################################################### diff --git a/tools/library/scripts/library.py b/tools/library/scripts/library.py index 2bb062da95..b9538cdbc5 100644 --- a/tools/library/scripts/library.py +++ b/tools/library/scripts/library.py @@ -71,6 +71,7 @@ class DataType(enum.Enum): cu16 = enum_auto() cu32 = enum_auto() cu64 = enum_auto() + invalid = enum_auto() # ShortDataTypeNames = { @@ -260,6 +261,8 @@ class MathOperation(enum.Enum): class LayoutType(enum.Enum): ColumnMajor = enum_auto() RowMajor = enum_auto() + ColumnMajorInterleaved2 = enum_auto() + RowMajorInterleaved2 = enum_auto() ColumnMajorInterleaved32 = enum_auto() RowMajorInterleaved32 = enum_auto() ColumnMajorInterleaved64 = enum_auto() @@ -268,13 +271,17 @@ class LayoutType(enum.Enum): TensorNDHWC = enum_auto() TensorNCHW = enum_auto() TensorNGHWC = enum_auto() - TensorNCxHW32 = enum_auto() - TensorNCxHW64 = enum_auto() + TensorNC32HW32 = enum_auto() + TensorNC64HW64 = enum_auto() + TensorC32RSK32 = enum_auto() + TensorC64RSK64 = enum_auto() # LayoutTag = { LayoutType.ColumnMajor: 'cutlass::layout::ColumnMajor', LayoutType.RowMajor: 'cutlass::layout::RowMajor', + LayoutType.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>', + LayoutType.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>', LayoutType.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>', LayoutType.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>', LayoutType.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>', @@ -283,14 +290,18 @@ class LayoutType(enum.Enum): LayoutType.TensorNDHWC: 'cutlass::layout::TensorNDHWC', LayoutType.TensorNCHW: 'cutlass::layout::TensorNCHW', LayoutType.TensorNGHWC: 'cutlass::layout::TensorNGHWC', - 
LayoutType.TensorNCxHW32: 'cutlass::layout::TensorNCxHW32', - LayoutType.TensorNCxHW64: 'cutlass::layout::TensorNCxHW64' + LayoutType.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>', + LayoutType.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>', + LayoutType.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>', + LayoutType.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>', } # TransposedLayout = { LayoutType.ColumnMajor: LayoutType.RowMajor, LayoutType.RowMajor: LayoutType.ColumnMajor, + LayoutType.ColumnMajorInterleaved2: LayoutType.RowMajorInterleaved2, + LayoutType.RowMajorInterleaved2: LayoutType.ColumnMajorInterleaved2, LayoutType.ColumnMajorInterleaved32: LayoutType.RowMajorInterleaved32, LayoutType.RowMajorInterleaved32: LayoutType.ColumnMajorInterleaved32, LayoutType.ColumnMajorInterleaved64: LayoutType.RowMajorInterleaved64, @@ -301,17 +312,21 @@ class LayoutType(enum.Enum): # ShortLayoutTypeNames = { LayoutType.ColumnMajor: 'n', + LayoutType.ColumnMajorInterleaved32: 'n2', LayoutType.ColumnMajorInterleaved32: 'n32', LayoutType.ColumnMajorInterleaved64: 'n64', LayoutType.RowMajor: 't', + LayoutType.RowMajorInterleaved2: 't2', LayoutType.RowMajorInterleaved32: 't32', LayoutType.RowMajorInterleaved64: 't64', LayoutType.TensorNHWC: 'nhwc', LayoutType.TensorNDHWC: 'ndhwc', LayoutType.TensorNCHW: 'nchw', LayoutType.TensorNGHWC: 'nghwc', - LayoutType.TensorNCxHW32: 'ncxhw32', - LayoutType.TensorNCxHW64: 'ncxhw64' + LayoutType.TensorNC32HW32: 'nc32hw32', + LayoutType.TensorNC64HW64: 'nc64hw64', + LayoutType.TensorC32RSK32: 'c32rsk32', + LayoutType.TensorC64RSK64: 'c64rsk64' } # @@ -346,9 +361,14 @@ class OpcodeClass(enum.Enum): # class OperationKind(enum.Enum): Gemm = enum_auto() + Conv2d = enum_auto() + Conv3d = enum_auto() + # OperationKindNames = { OperationKind.Gemm: 'gemm' + , OperationKind.Conv2d: 'conv2d' + , OperationKind.Conv3d: 'conv3d' } # @@ -424,6 +444,61 @@ class SwizzlingFunctor(enum.Enum): SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>', SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>', } + +################################################################################################### + +# +class ConvKind(enum.Enum): + Fprop = enum_auto() + Dgrad = enum_auto() + Wgrad = enum_auto() + +# +ConvKindTag = { + ConvKind.Fprop: 'cutlass::conv::Operator::kFprop', + ConvKind.Dgrad: 'cutlass::conv::Operator::kDgrad', + ConvKind.Wgrad: 'cutlass::conv::Operator::kWgrad' +} + +ConvKindNames = { + ConvKind.Fprop: 'fprop', + ConvKind.Dgrad: 'dgrad', + ConvKind.Wgrad: 'wgrad', +} + +# +class IteratorAlgorithm(enum.Enum): + Analytic = enum_auto() + Optimized = enum_auto() + +# +IteratorAlgorithmTag = { + IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic', + IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized', +} + +IteratorAlgorithmNames = { + IteratorAlgorithm.Analytic: 'analytic', + IteratorAlgorithm.Optimized: 'optimized', +} + +# +class StrideSupport(enum.Enum): + Strided = enum_auto() + Unity = enum_auto() + +# +StrideSupportTag = { + StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided', + StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity', +} + +StrideSupportNames = { + StrideSupport.Strided: '', + StrideSupport.Unity: 'unity_stride', +} + + ################################################################################################### # diff --git a/tools/library/scripts/manifest.py 
b/tools/library/scripts/manifest.py index 2f0aa24ecb..409ec09a27 100644 --- a/tools/library/scripts/manifest.py +++ b/tools/library/scripts/manifest.py @@ -10,6 +10,9 @@ from library import * from gemm_operation import * +from conv2d_operation import * +from conv3d_operation import * + ################################################################################################### class EmitOperationKindLibrary: @@ -20,6 +23,8 @@ def __init__(self, generated_path, kind, args): self.emitters = { OperationKind.Gemm: EmitGemmConfigurationLibrary + , OperationKind.Conv2d: EmitConv2dConfigurationLibrary + , OperationKind.Conv3d: EmitConv3dConfigurationLibrary } self.configurations = []; @@ -112,7 +117,10 @@ class Manifest: def __init__(self, args): self.operations = {} self.args = args - self.compute_capabilities = [int(x) for x in args.architectures.split(';')] + + architectures = args.architectures.split(';') if len(args.architectures) else ['50',] + self.compute_capabilities = [int(x) for x in architectures] + self.selected_kernels = [] if args.operations == 'all': @@ -121,6 +129,8 @@ def __init__(self, args): operations_list = [ OperationKind.Gemm + , OperationKind.Conv2d + , OperationKind.Conv3d ] self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')] diff --git a/tools/library/src/conv2d_operation.h b/tools/library/src/conv2d_operation.h new file mode 100644 index 0000000000..5e8f887fd1 --- /dev/null +++ b/tools/library/src/conv2d_operation.h @@ -0,0 +1,380 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines operations for all CONV operation kinds in CUTLASS Library. 
+*/ + +#pragma once +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/kernel/default_conv2d_dgrad.h" +#include "cutlass/conv/kernel/default_conv2d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/library/library.h" +#include "library_internal.h" +#include "cutlass/util/host_tensor.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/core_io.h" +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class Conv2dOperationBase : public Operation { +public: + + using Operator = Operator_; + + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = Operator::kIteratorAlgorithm; + static cutlass::conv::Operator const kConvolutionalOperator = Operator::kConvolutionalOperator; + + using OperatorArguments = typename Operator::Arguments; + +protected: + + /// + ConvDescription description_; + +public: + + /// Constructor + Conv2dOperationBase(char const *name = "unknown_conv2d") { + + description_.name = name; + description_.provider = Provider::kCUTLASS; + description_.kind = OperationKind::kConv2d; + description_.conv_dim = Operator::kConvDim; + + description_.iterator_algorithm = IteratorAlgorithmMap::kId; + + description_.tile_description.threadblock_shape = make_Coord( + Operator::ThreadblockShape::kM, + Operator::ThreadblockShape::kN, + Operator::ThreadblockShape::kK); + + description_.tile_description.threadblock_stages = Operator::kStages; + + description_.tile_description.warp_count = make_Coord( + Operator::ImplicitGemmKernel::WarpCount::kM, + Operator::ImplicitGemmKernel::WarpCount::kN, + Operator::ImplicitGemmKernel::WarpCount::kK); + + description_.tile_description.math_instruction.instruction_shape = make_Coord( + Operator::InstructionShape::kM, + Operator::InstructionShape::kN, + Operator::InstructionShape::kK); + + description_.tile_description.math_instruction.element_accumulator = + NumericTypeMap::kId; + + description_.tile_description.math_instruction.opcode_class = + OpcodeClassMap::kId; + + description_.tile_description.math_instruction.math_operation = + MathOperationMap::kId; + + description_.tile_description.minimum_compute_capability = + ArchMap::kMin; + + description_.tile_description.maximum_compute_capability = + ArchMap::kMax; + + description_.A = make_TensorDescription(); + description_.B = make_TensorDescription(); + description_.C = make_TensorDescription(); + description_.element_epilogue = NumericTypeMap::kId; + + // TODO: Add split k mode Serial and parallel to convolutions + // description_.split_k_mode = Operator::kSplitK ? 
SplitKMode::kSerial : SplitKMode::kNone; + + } + + /// Returns the description of the GEMM operation + virtual OperationDescription const & description() const { + return description_; + } +}; + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Conv2d library operation class for cutlass profiler +// +/////////////////////////////////////////////////////////////////////////////////////////////////// +template +class Conv2dOperation : public Conv2dOperationBase { +public: + + using Operator = Operator_; + + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + static cutlass::conv::Operator const kConvolutionalOperator = Operator::kConvolutionalOperator; + + using OperatorArguments = typename Operator::Arguments; + +public: + /// Constructor + Conv2dOperation(char const *name = "unknown_conv2d_fprop") : Conv2dOperationBase(name) { + this->description_.conv_kind = ConvKindMap::kId; + } + +protected: + + /// Constructs the arguments structure given the configuration and arguments + static Status construct_arguments_( + OperatorArguments &operator_args, + Conv2dConfiguration const *configuration) { + + + operator_args.problem_size = configuration->problem_size; + + operator_args.ref_A = + { + nullptr, + LayoutA::packed(implicit_gemm_tensor_a_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_B = + { + nullptr, + LayoutB::packed(implicit_gemm_tensor_b_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_C = + { + nullptr, + LayoutC::packed(implicit_gemm_tensor_c_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_D = + { + nullptr, + LayoutC::packed(implicit_gemm_tensor_c_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.split_k_mode = configuration->split_k_mode; + + return Status::kSuccess; + } + + /// Constructs the arguments structure given the configuration and arguments + static Status update_arguments_( + OperatorArguments &operator_args, + ConvArguments const *arguments) { + + if (arguments->pointer_mode == ScalarPointerMode::kHost) { + typename Operator::EpilogueOutputOp::Params params( + *static_cast(arguments->alpha), + *static_cast(arguments->beta) + ); + operator_args.output_op = params; + } + else if (arguments->pointer_mode == ScalarPointerMode::kDevice){ + typename Operator::EpilogueOutputOp::Params params( + static_cast(arguments->alpha), + static_cast(arguments->beta) + ); + operator_args.output_op = params; + } + else { + return Status::kErrorInvalidProblem; + } + + operator_args.ref_A.reset(static_cast(const_cast(arguments->A))); + operator_args.ref_B.reset(static_cast(const_cast(arguments->B))); + operator_args.ref_C.reset(static_cast(const_cast(arguments->C))); + operator_args.ref_D.reset(static_cast(const_cast(arguments->D))); + + return Status::kSuccess; + } + +public: + + /// Returns success if the operation can proceed + virtual Status can_implement( + void const *configuration_ptr, + void const *arguments_ptr) const { + + Conv2dConfiguration const *configuration = + static_cast(configuration_ptr); + + 
ConvArguments const *arguments = + static_cast(arguments_ptr); + + OperatorArguments args; + + Status status = construct_arguments_(args, configuration); + + if (status != Status::kSuccess) { + return status; + } + + status = update_arguments_(args, arguments); + + if (status != Status::kSuccess) { + return status; + } + + return Operator::can_implement(args); + + } + + /// Gets the host-side workspace + virtual uint64_t get_host_workspace_size( + void const *configuration) const { + + return sizeof(Operator); + } + + /// Gets the device-side workspace + virtual uint64_t get_device_workspace_size( + void const *configuration_ptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return 0; + } + + return Operator::get_workspace_size(args); + } + + /// Initializes the workspace + virtual Status initialize( + void const *configuration_ptr, + void *host_workspace, + void *device_workspace, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = new (host_workspace) Operator; + //std::cout << "initialize library::Conv2dOperation" << std::endl; + //print_operator_args(args); + return op->initialize(args, device_workspace, stream); + + } + + /// Runs the kernel + virtual Status run( + void const *arguments_ptr, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = update_arguments_( + args, + static_cast(arguments_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = static_cast(host_workspace); + + status = op->update(args, device_workspace); + + if (status != Status::kSuccess) { + return status; + } + //std::cout << "run library::Conv2dOperation" << std::endl; + //print_operator_args(args); + return op->run(stream); + } + + /// Call print_operator_args from the Conv2dOperation::initialize() + // to dump arguments passed on to cutlass operator for debugging + void print_operator_args(OperatorArguments &operator_args) const { + std::cout << "Conv2dOperation::OperatorArguments" << std::endl + << " problem_size:" << std::endl + << operator_args.problem_size << std::endl + << " split_k_mode: " + << (operator_args.split_k_mode == cutlass::conv::SplitKMode::kSerial ? 
"serial" : "parallel") << std::endl + << " epilouge (alpha, beta): " + << operator_args.output_op.alpha << ", " + << operator_args.output_op.beta << std::endl + << " ref_A (ptr, {stride}): " + << operator_args.ref_A.data() << ", {" + << operator_args.ref_A.stride(0) << ", " + << operator_args.ref_A.stride(1) << ", " + << operator_args.ref_A.stride(2) << "}" << std::endl + << " ref_B (ptr, {stride}): " + << operator_args.ref_B.data() << ", {" + << operator_args.ref_B.stride(0) << ", " + << operator_args.ref_B.stride(1) << ", " + << operator_args.ref_B.stride(2) << "}" << std::endl + << " ref_C (ptr, {stride}): " + << operator_args.ref_C.data() << ", {" + << operator_args.ref_C.stride(0) << ", " + << operator_args.ref_C.stride(1) << ", " + << operator_args.ref_C.stride(2) << "}" << std::endl + << " ref_D (ptr, {stride}): " + << operator_args.ref_D.data() << ", {" + << operator_args.ref_D.stride(0) << ", " + << operator_args.ref_D.stride(1) << ", " + << operator_args.ref_D.stride(2) << "}" << std::endl; + } +}; + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/library/src/conv3d_operation.h b/tools/library/src/conv3d_operation.h new file mode 100644 index 0000000000..32ad036320 --- /dev/null +++ b/tools/library/src/conv3d_operation.h @@ -0,0 +1,378 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines operations for all CONV operation kinds in CUTLASS Library. 
+*/ + +#pragma once +#include +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/kernel/default_conv3d_dgrad.h" +#include "cutlass/conv/kernel/default_conv3d_wgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/library/library.h" +#include "library_internal.h" +#include "cutlass/util/host_tensor.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/core_io.h" +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class Conv3dOperationBase : public Operation { +public: + + using Operator = Operator_; + + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = Operator::kIteratorAlgorithm; + static cutlass::conv::Operator const kConvolutionalOperator = Operator::kConvolutionalOperator; + + using OperatorArguments = typename Operator::Arguments; + +protected: + + /// + ConvDescription description_; + +public: + + /// Constructor + Conv3dOperationBase(char const *name = "unknown_conv3d") { + + description_.name = name; + description_.provider = Provider::kCUTLASS; + description_.kind = OperationKind::kConv3d; + description_.conv_dim = Operator::kConvDim; + + description_.iterator_algorithm = IteratorAlgorithmMap::kId; + + description_.tile_description.threadblock_shape = make_Coord( + Operator::ThreadblockShape::kM, + Operator::ThreadblockShape::kN, + Operator::ThreadblockShape::kK); + + description_.tile_description.threadblock_stages = Operator::kStages; + + description_.tile_description.warp_count = make_Coord( + Operator::ImplicitGemmKernel::WarpCount::kM, + Operator::ImplicitGemmKernel::WarpCount::kN, + Operator::ImplicitGemmKernel::WarpCount::kK); + + description_.tile_description.math_instruction.instruction_shape = make_Coord( + Operator::InstructionShape::kM, + Operator::InstructionShape::kN, + Operator::InstructionShape::kK); + + description_.tile_description.math_instruction.element_accumulator = + NumericTypeMap::kId; + + description_.tile_description.math_instruction.opcode_class = + OpcodeClassMap::kId; + + description_.tile_description.minimum_compute_capability = + ArchMap::kMin; + + description_.tile_description.maximum_compute_capability = + ArchMap::kMax; + + description_.A = make_TensorDescription(); + description_.B = make_TensorDescription(); + description_.C = make_TensorDescription(); + description_.element_epilogue = NumericTypeMap::kId; + + } + + /// Returns the description of the GEMM operation + virtual OperationDescription const & description() const { + return description_; + } +}; + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Conv2d library operation class for cutlass profiler +// +/////////////////////////////////////////////////////////////////////////////////////////////////// +template 
+class Conv3dOperation : public Conv3dOperationBase { +public: + + using Operator = Operator_; + + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + static cutlass::conv::Operator const kConvolutionalOperator = Operator::kConvolutionalOperator; + + using OperatorArguments = typename Operator::Arguments; + +public: + /// Constructor + Conv3dOperation(char const *name = "unknown_conv3d_fprop") : Conv3dOperationBase(name) { + this->description_.conv_kind = ConvKindMap::kId; + } + +protected: + + /// Constructs the arguments structure given the configuration and arguments + static Status construct_arguments_( + OperatorArguments &operator_args, + Conv3dConfiguration const *configuration) { + + + operator_args.problem_size = configuration->problem_size; + + operator_args.ref_A = + { + nullptr, + LayoutA::packed(implicit_gemm_tensor_a_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_B = + { + nullptr, + LayoutB::packed(implicit_gemm_tensor_b_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_C = + { + nullptr, + LayoutC::packed(implicit_gemm_tensor_c_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.ref_D = + { + nullptr, + LayoutC::packed(implicit_gemm_tensor_c_extent(kConvolutionalOperator, configuration->problem_size)) + }; + + operator_args.split_k_mode = configuration->split_k_mode; + + return Status::kSuccess; + } + + /// Constructs the arguments structure given the configuration and arguments + static Status update_arguments_( + OperatorArguments &operator_args, + ConvArguments const *arguments) { + + if (arguments->pointer_mode == ScalarPointerMode::kHost) { + typename Operator::EpilogueOutputOp::Params params( + *static_cast(arguments->alpha), + *static_cast(arguments->beta) + ); + operator_args.output_op = params; + } + else if (arguments->pointer_mode == ScalarPointerMode::kDevice){ + typename Operator::EpilogueOutputOp::Params params( + static_cast(arguments->alpha), + static_cast(arguments->beta) + ); + operator_args.output_op = params; + } + else { + return Status::kErrorInvalidProblem; + } + + operator_args.ref_A.reset(static_cast(const_cast(arguments->A))); + operator_args.ref_B.reset(static_cast(const_cast(arguments->B))); + operator_args.ref_C.reset(static_cast(const_cast(arguments->C))); + operator_args.ref_D.reset(static_cast(const_cast(arguments->D))); + + return Status::kSuccess; + } + +public: + + /// Returns success if the operation can proceed + virtual Status can_implement( + void const *configuration_ptr, + void const *arguments_ptr) const { + + Conv3dConfiguration const *configuration = + static_cast(configuration_ptr); + + ConvArguments const *arguments = + static_cast(arguments_ptr); + + OperatorArguments args; + + Status status = construct_arguments_(args, configuration); + + if (status != Status::kSuccess) { + return status; + } + + status = update_arguments_(args, arguments); + + if (status != Status::kSuccess) { + return status; + } + + return Operator::can_implement(args); + + } + + /// Gets the host-side workspace + virtual uint64_t get_host_workspace_size( + void const 
*configuration) const { + + return sizeof(Operator); + } + + /// Gets the device-side workspace + virtual uint64_t get_device_workspace_size( + void const *configuration_ptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return 0; + } + + return Operator::get_workspace_size(args); + } + + /// Initializes the workspace + virtual Status initialize( + void const *configuration_ptr, + void *host_workspace, + void *device_workspace, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = new (host_workspace) Operator; + //std::cout << "initialize library::Conv3dOperation" << std::endl; + //print_operator_args(args); + return op->initialize(args, device_workspace, stream); + + } + + /// Runs the kernel + virtual Status run( + void const *arguments_ptr, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = update_arguments_( + args, + static_cast(arguments_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = static_cast(host_workspace); + + status = op->update(args, device_workspace); + + if (status != Status::kSuccess) { + return status; + } + //std::cout << "run library::Conv3dOperation" << std::endl; + //print_operator_args(args); + return op->run(stream); + } + + /// Call print_operator_args from the Conv3dOperation::initialize() + // to dump arguments passed on to cutlass operator for debugging + void print_operator_args(OperatorArguments &operator_args) const { + std::cout << "Conv3dOperation::OperatorArguments" << std::endl + << " problem_size: " + << operator_args.problem_size << std::endl + << " split_k_mode: " + << (operator_args.split_k_mode == cutlass::conv::SplitKMode::kSerial ? 
"serial" : "parallel") << std::endl + << " epilouge (alpha, beta): " + << operator_args.output_op.alpha << ", " + << operator_args.output_op.beta << std::endl + << " ref_A (ptr, {stride}): " + << operator_args.ref_A.data() << ", {" + << operator_args.ref_A.stride(0) << ", " + << operator_args.ref_A.stride(1) << ", " + << operator_args.ref_A.stride(2) << ", " + << operator_args.ref_A.stride(3) << "}" << std::endl + << " ref_B (ptr, {stride}): " + << operator_args.ref_B.data() << ", {" + << operator_args.ref_B.stride(0) << ", " + << operator_args.ref_B.stride(1) << ", " + << operator_args.ref_B.stride(2) << ", " + << operator_args.ref_B.stride(3) << "}" << std::endl + << " ref_C (ptr, {stride}): " + << operator_args.ref_C.data() << ", {" + << operator_args.ref_C.stride(0) << ", " + << operator_args.ref_C.stride(1) << ", " + << operator_args.ref_C.stride(2) << ", " + << operator_args.ref_C.stride(3) << "}" << std::endl + << " ref_D (ptr, {stride}): " + << operator_args.ref_D.data() << ", {" + << operator_args.ref_D.stride(0) << ", " + << operator_args.ref_D.stride(1) << ", " + << operator_args.ref_D.stride(2) << ", " + << operator_args.ref_D.stride(3) << "}" << std::endl; + } +}; + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu index bdddf2d7ca..3f19def654 100644 --- a/tools/library/src/handle.cu +++ b/tools/library/src/handle.cu @@ -1037,8 +1037,70 @@ Status Handle::gemm_planar_complex_array( } ///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Finds conv operation instances with Conv::ElementC = Reduction::ElementWorkspace +Operation const* find_conv_operation_for_parallel_reduction(Operation const *operation) { + + ConvDescription const &conv_desc = + static_cast(operation->description()); + + // if the curren conv operation accumulator and output data type match return operation + if(conv_desc.tile_description.math_instruction.element_accumulator == conv_desc.C.element) { + return operation; + } + + // find conv operation to match conv output and reduction workspace data type + ConvFunctionalKey key( + library::Provider::kCUTLASS, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.C.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue); + + // conv operation table for conv2d or conv3d + auto conv_operations = (conv_desc.kind == OperationKind::kConv2d) ? 
+ Singleton::get().operation_table.conv2d_operations : + Singleton::get().operation_table.conv3d_operations; + + // find ConvFunctionalKey in convolution operation table + auto operators_it = conv_operations.find(key); + + if (operators_it == conv_operations.end()) { + return nullptr; + } + + if (operators_it->second.empty()) { + return nullptr; + } + + // conv operation for same compute capability and iterator algorithm + ConvPreferenceKey preference_key( + conv_desc.tile_description.minimum_compute_capability, + conv_desc.iterator_algorithm); + + auto it = operators_it->second.find(preference_key); + + if(it == operators_it->second.end()) { + return nullptr; + } + + // return matching conv opertion (same tile sizes and instruction) + for (auto op : it->second) { + if (op->description().tile_description == operation->description().tile_description) { + return op; + } + } + + return nullptr; +} +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace library } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h index 21190cc825..4bbd21c763 100644 --- a/tools/library/src/library_internal.h +++ b/tools/library/src/library_internal.h @@ -227,6 +227,23 @@ template <> struct LayoutMap { template <> struct LayoutMap { static LayoutTypeID const kId = LayoutTypeID::kTensorNDHWC; }; + +template <> struct LayoutMap> { + static LayoutTypeID const kId = LayoutTypeID::kTensorNC32HW32; +}; + +template <> struct LayoutMap> { + static LayoutTypeID const kId = LayoutTypeID::kTensorNC64HW64; +}; + +template <> struct LayoutMap> { + static LayoutTypeID const kId = LayoutTypeID::kTensorC32RSK32; +}; + +template <> struct LayoutMap> { + static LayoutTypeID const kId = LayoutTypeID::kTensorC64RSK64; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// template struct OpcodeClassMap; @@ -257,6 +274,43 @@ template <> struct ComplexTransformMap { ///////////////////////////////////////////////////////////////////////////////////////////////// +template struct ConvModeMap; + +template <> struct ConvModeMap { + static ConvModeID const kId = ConvModeID::kCrossCorrelation; +}; + +template <> struct ConvModeMap { + static ConvModeID const kId = ConvModeID::kConvolution; +}; + + +template struct ConvKindMap; + +template <> struct ConvKindMap { + static ConvKind const kId = ConvKind::kFprop; +}; + +template <> struct ConvKindMap { + static ConvKind const kId = ConvKind::kDgrad; +}; + +template <> struct ConvKindMap { + static ConvKind const kId = ConvKind::kWgrad; +}; + + +template struct IteratorAlgorithmMap; + +template <> struct IteratorAlgorithmMap { + static IteratorAlgorithmID const kId = IteratorAlgorithmID::kAnalytic; +}; + +template <> struct IteratorAlgorithmMap { + static IteratorAlgorithmID const kId = IteratorAlgorithmID::kOptimized; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + template TensorDescription make_TensorDescription(int alignment = 1) { TensorDescription desc; diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp index 29d2ef156f..12358dcdd3 100644 --- a/tools/library/src/manifest.cpp +++ b/tools/library/src/manifest.cpp @@ -36,6 +36,11 @@ namespace cutlass { namespace library { 
////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void initialize_reference_operations(Manifest &manifest); + +////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// Top-level initialization Status Manifest::initialize() { @@ -46,6 +51,12 @@ Status Manifest::initialize() { // initialize procedurally generated cutlass op in manifest object initialize_all(*this); + // initialize manually instanced conv3d reference op in manifest object + initialize_reference_operations(*this); + + // initialize manually instanced reduction reference op in manifest object + initialize_all_reduction_op(*this); + return Status::kSuccess; } diff --git a/tools/library/src/operation_table.cu b/tools/library/src/operation_table.cu index 64e4f264cf..482bded851 100644 --- a/tools/library/src/operation_table.cu +++ b/tools/library/src/operation_table.cu @@ -76,6 +76,55 @@ void OperationTable::append(Manifest const &manifest) { } + // insert all conv2d or conv3d operation into operation table + if (desc.kind == OperationKind::kConv2d || desc.kind == OperationKind::kConv3d) { + auto &conv_desc = static_cast(desc); + + ConvFunctionalKey functional_key( + conv_desc.provider, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.C.element, + conv_desc.C.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue + ); + + Operation const *op = operation.get(); + + int cc = conv_desc.tile_description.minimum_compute_capability; + + ConvPreferenceKey preference_key(cc, conv_desc.iterator_algorithm); + + // insert conv operation to conv2d_operations or conv3d_operations map + (desc.kind == OperationKind::kConv2d) ? + conv2d_operations[functional_key][preference_key].push_back(op) : + conv3d_operations[functional_key][preference_key].push_back(op); + } + + // insert all reduction operation into operation table + if (desc.kind == OperationKind::kReduction) { + auto &reduce_desc = static_cast(desc); + + ReductionFunctionalKey functional_key( + reduce_desc.provider, + reduce_desc.element_workspace, + reduce_desc.tile_description.math_instruction.element_accumulator, + reduce_desc.element_output, + reduce_desc.element_epilogue, + library::MathOperationID::kAdd, + library::EpilogueKind::kLinearCombination + ); + + Operation const *op = operation.get(); + + reduction_operations[functional_key] = op; + + } + } } diff --git a/tools/library/src/reduction/init_reduction_operations.cu b/tools/library/src/reduction/init_reduction_operations.cu new file mode 100644 index 0000000000..5f86b64f78 --- /dev/null +++ b/tools/library/src/reduction/init_reduction_operations.cu @@ -0,0 +1,57 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Initialize operations for reduction operation in CUTLASS Library. + +*/ + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +namespace cutlass { +namespace library { +/////////////////////////////////////////////////////////////////////////////////////////////// +// CUTLASS Reduction Instances // +/////////////////////////////////////////////////////////////////////////////////////////////// +void initialize_reduce_add_linear_combination_f32_f32_f16(Manifest &manifest); +void initialize_reduce_add_linear_combination_f32_f32_f32(Manifest &manifest); +void initialize_reduce_add_linear_combination_cf32_cf32_cf32(Manifest &manifest); + +// +// Entry point to construct operations +// +void initialize_all_reduction_op(Manifest &manifest) { + + initialize_reduce_add_linear_combination_f32_f32_f16(manifest); + initialize_reduce_add_linear_combination_f32_f32_f32(manifest); + initialize_reduce_add_linear_combination_cf32_cf32_cf32(manifest); + +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass diff --git a/tools/library/src/reduction/reduction_device.cu b/tools/library/src/reduction/reduction_device.cu new file mode 100644 index 0000000000..e2133cc0a5 --- /dev/null +++ b/tools/library/src/reduction/reduction_device.cu @@ -0,0 +1,145 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines operations for reduction operation in CUTLASS Library. +*/ + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "reduction_operation.h" + +namespace cutlass { +namespace library { + +// naming convention initialize_reduce_[ReductionOp]_[EpilogueOp]_[ElementWorkspace]_[ElementAccumulator]_[ElementOutput] + +void initialize_reduce_add_linear_combination_f32_f32_f16(Manifest &manifest) { + + using ElementWorkspace = float; + using ElementAccumulator = float; + using ElementOutput = cutlass::half_t; + using ElementCompute = float; + + using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using Operation_reduce_add_linear_combination_f32_f32_f16 = cutlass::reduction::device::ReduceSplitK< + cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + > + >; + + manifest.append(new ReductionOperation< + Operation_reduce_add_linear_combination_f32_f32_f16>( + "reduce_add_linear_combination_f32_f32_f16" + )); +} + + +void initialize_reduce_add_linear_combination_f32_f32_f32(Manifest &manifest) { + + using ElementWorkspace = float; + using ElementAccumulator = float; + using ElementOutput = float; + using ElementCompute = float; + + using EpilogueOutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using Operation_reduce_add_linear_combination_f32_f32_f32 = cutlass::reduction::device::ReduceSplitK< + cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + > + >; + + manifest.append(new ReductionOperation< + Operation_reduce_add_linear_combination_f32_f32_f32>( + "reduce_add_linear_combination_f32_f32_f32" + )); +} + +void initialize_reduce_add_linear_combination_cf32_cf32_cf32(Manifest &manifest) { + + using ElementWorkspace = cutlass::complex; + using ElementAccumulator = cutlass::complex; + using ElementOutput = cutlass::complex; + using ElementCompute = cutlass::complex; + + using EpilogueOutputOp = 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using ReductionOp = cutlass::reduction::thread::ReduceAdd< + ElementAccumulator, + typename EpilogueOutputOp::ElementAccumulator, + EpilogueOutputOp::kCount + >; + + using Operation_reduce_add_linear_combination_cf32_cf32_cf32 = cutlass::reduction::device::ReduceSplitK< + cutlass::reduction::kernel::ReduceSplitK< + cutlass::MatrixShape<4, 32 * EpilogueOutputOp::kCount>, + EpilogueOutputOp, + ReductionOp + > + >; + + manifest.append(new ReductionOperation< + Operation_reduce_add_linear_combination_cf32_cf32_cf32>( + "reduce_add_linear_combination_cf32_cf32_cf32" + )); +} + + +} +} diff --git a/tools/library/src/reduction/reduction_operation.h b/tools/library/src/reduction/reduction_operation.h new file mode 100644 index 0000000000..88572ff684 --- /dev/null +++ b/tools/library/src/reduction/reduction_operation.h @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines operations for reduction operation in CUTLASS Library. 
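+          ReductionOperation wraps a device-level split-K reduction kernel (instantiated in
+          reduction_device.cu) behind the library::Operation interface.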
+*/ + +#pragma once +#include +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/reduction/thread/reduction_operators.h" +#include "cutlass/reduction/device/reduce_split_k.h" + +#include "cutlass/library/library.h" +#include "library_internal.h" +#include "cutlass/core_io.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +class ReductionOperation : public Operation { +public: + using Operator = Operator_; + + using ElementWorkspace = typename Operator::ElementWorkspace; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementOutput = typename Operator::ElementOutput; + + using ElementCompute = typename Operator::OutputOp::ElementCompute; + + using OperatorArguments = typename Operator::Arguments; + +protected: + + /// + ReductionDescription description_; + +public: + + /// Constructor + ReductionOperation(char const *name = "unknown_reduction") { + + description_.name = name; + description_.provider = Provider::kCUTLASS; + description_.kind = OperationKind::kReduction; + + description_.tile_description.threadblock_shape = make_Coord(Operator::Shape::kRow, Operator::Shape::kColumn, 1); + + description_.tile_description.math_instruction.instruction_shape = make_Coord(1, 1, 1); + description_.tile_description.math_instruction.element_accumulator = NumericTypeMap::kId; + description_.tile_description.math_instruction.opcode_class = OpcodeClassID::kSimt; + description_.tile_description.math_instruction.math_operation = MathOperationID::kAdd; + + description_.tile_description.minimum_compute_capability = 50; + description_.tile_description.maximum_compute_capability = 1024; + + description_.element_workspace = NumericTypeMap::kId; + description_.element_output = NumericTypeMap::kId; + description_.element_epilogue = NumericTypeMap::kId; + + } + + /// Returns the description of the Reduction operation + virtual OperationDescription const & description() const { + return description_; + } + + +protected: + + /// Constructs the arguments structure given the configuration and arguments + static Status construct_arguments_( + OperatorArguments &operator_args, + ReductionConfiguration const *configuration) { + + operator_args.problem_size = configuration->problem_size; + operator_args.partitions = configuration->partitions; + operator_args.partition_stride = configuration->partition_stride; + + operator_args.workspace = {nullptr, int(configuration->ldw)}; + operator_args.source = {nullptr, int(configuration->lds)}; + operator_args.destination = {nullptr, int(configuration->ldd)}; + + return Status::kSuccess; + } + + /// Constructs the arguments structure given the configuration and arguments + static Status update_arguments_( + OperatorArguments &operator_args, + ReductionArguments const *arguments) { + + if (arguments->pointer_mode == ScalarPointerMode::kHost) { + typename Operator::OutputOp::Params params( + *static_cast(arguments->alpha), + *static_cast(arguments->beta) + ); + operator_args.output = params; + } + else if (arguments->pointer_mode == ScalarPointerMode::kDevice){ + typename Operator::OutputOp::Params params( + static_cast(arguments->alpha), + static_cast(arguments->beta) + ); + operator_args.output = params; + } + else { + return Status::kErrorInvalidProblem; + } + + 
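+    // Bind the caller-supplied workspace, source, and destination pointers to the operator
+    // arguments (they arrive as pointers-to-const, hence the const_cast below).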
operator_args.workspace.reset(static_cast(const_cast(arguments->workspace))); + operator_args.source.reset(static_cast(const_cast(arguments->source))); + operator_args.destination.reset(static_cast(const_cast(arguments->destination))); + + return Status::kSuccess; + } + +public: + + /// Returns success if the operation can proceed + virtual Status can_implement( + void const *configuration_ptr, + void const *arguments_ptr) const { + + ReductionConfiguration const *configuration = + static_cast(configuration_ptr); + + ReductionArguments const *arguments = + static_cast(arguments_ptr); + + OperatorArguments args; + + Status status = construct_arguments_(args, configuration); + + if (status != Status::kSuccess) { + return status; + } + + status = update_arguments_(args, arguments); + + if (status != Status::kSuccess) { + return status; + } + + return Operator::can_implement(args); + } + + /// Gets the host-side workspace + virtual uint64_t get_host_workspace_size( + void const *configuration) const { + + return sizeof(Operator); + } + + /// Gets the device-side workspace + virtual uint64_t get_device_workspace_size( + void const *configuration_ptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return 0; + } + + return Operator::get_workspace_size(args); + } + + /// Initializes the workspace + virtual Status initialize( + void const *configuration_ptr, + void *host_workspace, + void *device_workspace, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = new (host_workspace) Operator; + //std::cout << "initialize library::Reduction" << std::endl; + //print_operator_args(args); + return op->initialize(args, device_workspace, stream); + } + + /// Runs the kernel + virtual Status run( + void const *arguments_ptr, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = update_arguments_( + args, + static_cast(arguments_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = static_cast(host_workspace); + + status = op->update(args, device_workspace); + + if (status != Status::kSuccess) { + return status; + } + + //std::cout << "run library::Reduction" << std::endl; + //print_operator_args(args); + return op->run(stream); + } + + /// Call print_operator_args from the Reduction::initialize() + // to dump arguments passed on to cutlass operator for debugging + void print_operator_args(OperatorArguments &operator_args) const { + std::cout << "Reduction::OperatorArguments" << std::endl + << " problem_size: " + << operator_args.problem_size << std::endl + << " partitions: " + << operator_args.partitions << std::endl + << " partition_stride: " + << operator_args.partition_stride << std::endl + << " epilouge (alpha, beta): " + << operator_args.output.alpha << ", " + << operator_args.output.beta << std::endl + << " workspace (ptr, stride): " + << operator_args.workspace.data() << ", " + << operator_args.workspace.stride(0) << std::endl + << " source (ptr, stride): " + << operator_args.source.data() << ", " + << operator_args.source.stride(0) << std::endl + << " destination (ptr, stride): " + << operator_args.destination.data() << ", " + << operator_args.destination.stride(0) << std::endl; + } +}; + 
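Conv2dOperation, Conv3dOperation, and ReductionOperation above all expose the same stateless
library::Operation protocol: check can_implement(), query the host- and device-side workspace
sizes, initialize() the host workspace (which placement-news the underlying Operator), then call
run() with per-launch arguments. The sketch below shows one way a caller might drive any of these
operations end to end; run_operation, config, and args are illustrative names only (the concrete
configuration/arguments structs depend on the operation kind), and CUDA allocation error handling
is elided.

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"

// Drives a library::Operation through can_implement / initialize / run.
// 'config' and 'args' must point to the configuration and arguments structs that
// match the concrete operation (e.g. Conv2dConfiguration / ConvArguments).
cutlass::Status run_operation(
  cutlass::library::Operation const *op,
  void const *config,
  void const *args,
  cudaStream_t stream = nullptr) {

  using cutlass::Status;

  // Reject unsupported problems before allocating any workspace.
  Status status = op->can_implement(config, args);
  if (status != Status::kSuccess) {
    return status;
  }

  // The host workspace holds the underlying CUTLASS Operator object.
  std::vector<uint8_t> host_workspace(op->get_host_workspace_size(config));

  // The device workspace holds scratch such as split-K partials, if the operation needs any.
  void *device_workspace = nullptr;
  uint64_t device_bytes = op->get_device_workspace_size(config);
  if (device_bytes) {
    cudaMalloc(&device_workspace, size_t(device_bytes));  // allocation errors elided in this sketch
  }

  status = op->initialize(config, host_workspace.data(), device_workspace, stream);

  if (status == Status::kSuccess) {
    status = op->run(args, host_workspace.data(), device_workspace, stream);
  }

  if (device_workspace) {
    cudaFree(device_workspace);
  }

  return status;
}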
+ +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/library/src/reference/conv2d.cu b/tools/library/src/reference/conv2d.cu new file mode 100644 index 0000000000..750ebdf31c --- /dev/null +++ b/tools/library/src/reference/conv2d.cu @@ -0,0 +1,223 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief + +*/ + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "conv_reference_operation.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +void initialize_conv2d_reference_operations(Manifest &manifest) { + + make_conv_all< + 2, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, + cutlass::half_t + >(manifest); + + make_conv_all< + 2, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::half_t, cutlass::layout::TensorNHWC, + cutlass::half_t, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + float, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, cutlass::layout::TensorNHWC, + float, + float + >(manifest); + + make_conv_all< + 2, + cutlass::complex, cutlass::layout::TensorNHWC, + cutlass::complex, cutlass::layout::TensorNHWC, + cutlass::complex, cutlass::layout::TensorNHWC, + cutlass::complex, + cutlass::complex + >(manifest); + + make_conv_fprop< + 2, + int8_t, cutlass::layout::TensorNHWC, + int8_t, cutlass::layout::TensorNHWC, + int32_t, cutlass::layout::TensorNHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + int8_t, cutlass::layout::TensorNHWC, + int8_t, cutlass::layout::TensorNHWC, + int8_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + uint8_t, cutlass::layout::TensorNHWC, + uint8_t, cutlass::layout::TensorNHWC, + uint8_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + uint8_t, cutlass::layout::TensorNHWC, + uint8_t, cutlass::layout::TensorNHWC, + int32_t, cutlass::layout::TensorNHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + uint8_t, cutlass::layout::TensorNHWC, + uint8_t, cutlass::layout::TensorNHWC, + int8_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + cutlass::int4b_t, cutlass::layout::TensorNHWC, + cutlass::int4b_t, 
cutlass::layout::TensorNHWC, + int32_t, cutlass::layout::TensorNHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + cutlass::int4b_t, cutlass::layout::TensorNHWC, + cutlass::int4b_t, cutlass::layout::TensorNHWC, + cutlass::int4b_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + int32_t, cutlass::layout::TensorNHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 2, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + cutlass::uint4b_t, cutlass::layout::TensorNHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/library/src/reference/conv3d.cu b/tools/library/src/reference/conv3d.cu new file mode 100644 index 0000000000..1e1544bff6 --- /dev/null +++ b/tools/library/src/reference/conv3d.cu @@ -0,0 +1,203 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief +*/ + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "conv_reference_operation.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +void initialize_conv3d_reference_operations(Manifest &manifest) { + + make_conv_all< + 3, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, + cutlass::half_t + >(manifest); + + make_conv_all< + 3, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::half_t, cutlass::layout::TensorNDHWC, + cutlass::half_t, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + cutlass::bfloat16_t, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + cutlass::tfloat32_t, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_all< + 3, + float, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, cutlass::layout::TensorNDHWC, + float, + float + >(manifest); + + make_conv_fprop< + 3, + int8_t, cutlass::layout::TensorNDHWC, + int8_t, cutlass::layout::TensorNDHWC, + int32_t, cutlass::layout::TensorNDHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + int8_t, cutlass::layout::TensorNDHWC, + int8_t, cutlass::layout::TensorNDHWC, + int8_t, cutlass::layout::TensorNDHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + uint8_t, cutlass::layout::TensorNDHWC, + uint8_t, cutlass::layout::TensorNDHWC, + int32_t, cutlass::layout::TensorNDHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + uint8_t, cutlass::layout::TensorNDHWC, + uint8_t, cutlass::layout::TensorNDHWC, + int8_t, cutlass::layout::TensorNDHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + int32_t, cutlass::layout::TensorNDHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + cutlass::int4b_t, cutlass::layout::TensorNDHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + 
cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + int32_t, cutlass::layout::TensorNDHWC, + int32_t, + int32_t, + NumericConverterClamp + >(manifest); + + make_conv_fprop< + 3, + cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + cutlass::uint4b_t, cutlass::layout::TensorNDHWC, + float, + int32_t, + NumericConverterClamp + >(manifest); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/library/src/reference/conv_reference_operation.h b/tools/library/src/reference/conv_reference_operation.h new file mode 100644 index 0000000000..1e826ab29e --- /dev/null +++ b/tools/library/src/reference/conv_reference_operation.h @@ -0,0 +1,607 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief Defines operations for all CONV operation kinds in CUTLASS Library +*/ + +#pragma once + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" +#include "cutlass/library/util.h" +#include "library_internal.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + Provider kProvider, + conv::Operator ConvolutionalOperator, + int ConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +struct ConvReferenceDispatcher; + +/// Dispatcher for Conv2d (partially specialied for kConvDim == 2) +template < + Provider kProvider, + conv::Operator kConvolutionalOperator, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator, + typename ConvertOp, + typename InnerProductOp +> +struct ConvReferenceDispatcher< + kProvider, + kConvolutionalOperator, + 2, + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp> { + + static Status dispatch( + void const *configuration, + ElementA *ptr_A, + ElementB *ptr_B, + ElementC *ptr_C, + ElementC *ptr_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr + ) { + + Conv2dConfiguration const &config = + *static_cast(configuration); + + ConvKind const conv_kind = ConvKindMap::kId; + + if (kProvider == Provider::kReferenceHost) { + + cutlass::reference::host::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC , + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >( + kConvolutionalOperator, + config.problem_size, + {ptr_A, config.layout_a(conv_kind)}, + {ptr_B, config.layout_b(conv_kind)}, + {ptr_C, config.layout_c(conv_kind)}, + {ptr_D, config.layout_c(conv_kind)}, + alpha, + beta + ); + + return Status::kSuccess; + } + else if (kProvider == Provider::kReferenceDevice) { + return cutlass::reference::device::Conv2d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >( + kConvolutionalOperator, + config.problem_size, + {ptr_A, config.layout_a(conv_kind)}, + {ptr_B, config.layout_b(conv_kind)}, + {ptr_C, config.layout_c(conv_kind)}, + {ptr_D, config.layout_c(conv_kind)}, + alpha, + beta, + stream + ); + } + return Status::kErrorNotSupported; + } +}; + +/// Dispatcher for Conv3d (partially specialized for kConvDim == 3) +template < + Provider kProvider, + conv::Operator kConvolutionalOperator, + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator, + typename ConvertOp, + typename InnerProductOp +> 
+struct ConvReferenceDispatcher< + kProvider, + kConvolutionalOperator, + 3, + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp> { + + static Status dispatch( + void const *configuration, + ElementA *ptr_A, + ElementB *ptr_B, + ElementC *ptr_C, + ElementC *ptr_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr + ) { + + Conv3dConfiguration const &config = + *static_cast(configuration); + + ConvKind const conv_kind = ConvKindMap::kId; + + if (kProvider == Provider::kReferenceHost) { + cutlass::reference::host::Conv3d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC , + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >( + kConvolutionalOperator, + config.problem_size, + {ptr_A, config.layout_a(conv_kind)}, + {ptr_B, config.layout_b(conv_kind)}, + {ptr_C, config.layout_c(conv_kind)}, + {ptr_D, config.layout_c(conv_kind)}, + alpha, + beta + ); + + return Status::kSuccess; + } + else if (kProvider == Provider::kReferenceDevice) { + return cutlass::reference::device::Conv3d< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >( + kConvolutionalOperator, + config.problem_size, + {ptr_A, config.layout_a(conv_kind)}, + {ptr_B, config.layout_b(conv_kind)}, + {ptr_C, config.layout_c(conv_kind)}, + {ptr_D, config.layout_c(conv_kind)}, + alpha, + beta, + stream + ); + } + return Status::kErrorNotSupported; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + Provider Provider_, + conv::Operator ConvolutionalOperator, + int ConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +class ConvReferenceOperation : public Operation { +public: + static Provider const kProvider = Provider_; + static conv::Operator const kConvolutionalOperator = ConvolutionalOperator; + static int const kConvDim = ConvDim; + + using ElementA = ElementA_; + using LayoutA = LayoutA_; + using ElementB = ElementB_; + using LayoutB = LayoutB_; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using ElementCompute = ElementCompute_; + using ElementAccumulator = ElementAccumulator_; + using ConvertOp = ConvertOp_; + using InnerProductOp = InnerProductOp_; + +protected: + + /// Storage for the name string + std::string name_; + + /// + ConvDescription description_; + +public: + + /// Constructor + ConvReferenceOperation() { + + // Basic information + description_.provider = kProvider; + description_.kind = (kConvDim == 2 ? 
OperationKind::kConv2d : OperationKind::kConv3d); + description_.conv_kind = ConvKindMap::kId; + description_.conv_dim = kConvDim; + + // Tensor description + description_.A = make_TensorDescription(); + description_.B = make_TensorDescription(); + description_.C = make_TensorDescription(); + + // Epilogue compute and accumulator type description + description_.element_epilogue = NumericTypeMap::kId; + + description_.tile_description.math_instruction.element_accumulator = + NumericTypeMap::kId; + + // Iterator algorithm for convolution reference + description_.iterator_algorithm = IteratorAlgorithmID::kNone; + + // Compute capability for convolution reference + description_.tile_description.minimum_compute_capability = + (kProvider == Provider::kReferenceDevice ? 50 : 0); + + description_.tile_description.maximum_compute_capability = 1024; + + // Procedural name + std::stringstream ss; + + ss << "conv" << kConvDim << "d_" << to_string(description_.conv_kind) + << "_reference_" << to_string(description_.provider) + << "_" << to_string(description_.A.element) << to_string(description_.A.layout) + << "_" << to_string(description_.B.element) << to_string(description_.B.layout) + << "_" << to_string(description_.C.element) << to_string(description_.C.layout) + << "_" << to_string(description_.tile_description.math_instruction.element_accumulator); + + name_ = ss.str(); + + description_.name = name_.c_str(); + + // Epilogue compute and accumulator type description + description_.element_epilogue = NumericTypeMap::kId; + + description_.tile_description.math_instruction.element_accumulator = + NumericTypeMap::kId; + } + + /// Returns the description of the GEMM operation + virtual OperationDescription const & description() const { + return description_; + } + + virtual Status can_implement( + void const *configuration, + void const *arguments) const { + + return Status::kSuccess; + } + + virtual uint64_t get_host_workspace_size( + void const *configuration) const { + + switch (kConvDim) { + case 2: + return sizeof(Conv2dConfiguration); + case 3: + return sizeof(Conv3dConfiguration); + default: + break; + } + + return 0; + } + + virtual uint64_t get_device_workspace_size( + void const *configuration) const { + + return 0; + } + + virtual Status initialize( + void const *configuration, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + std::memcpy(host_workspace, configuration, get_host_workspace_size(configuration)); + + return Status::kSuccess; + } + + virtual Status run( + void const *arguments, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + ConvArguments const &args = *static_cast(arguments); + + ElementCompute alpha; + ElementCompute beta; + + alpha = *static_cast(args.alpha); + beta = *static_cast(args.beta); + + // TODO - respect pointer mode + + // Invoke 2D or 3D convolution + return detail::ConvReferenceDispatcher< + kProvider, + kConvolutionalOperator, + kConvDim, + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp + >::dispatch( + host_workspace, + static_cast(const_cast(args.A)), + static_cast(const_cast(args.B)), + static_cast(const_cast(args.C)), + static_cast(args.D), + alpha, + beta, + stream + ); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Constructs Fprop reference operators. 
+template < + int kConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +void make_conv_fprop(Manifest &manifest) { + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceHost, + conv::Operator::kFprop, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceDevice, + conv::Operator::kFprop, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); +} + +/// Constructs Dgrad and Wgrad reference operators. +template < + int kConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +void make_conv_backwards(Manifest &manifest) { + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceHost, + conv::Operator::kDgrad, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceDevice, + conv::Operator::kDgrad, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceHost, + conv::Operator::kWgrad, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); + + manifest.append(new ConvReferenceOperation< + Provider::kReferenceDevice, + conv::Operator::kWgrad, + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >); +} + +/// Six operators for the price of one. 
+template < + int kConvDim, + typename ElementA_, + typename LayoutA_, + typename ElementB_, + typename LayoutB_, + typename ElementC_, + typename LayoutC_, + typename ElementCompute_, + typename ElementAccumulator_ = ElementCompute_, + typename ConvertOp_ = NumericConverter, + typename InnerProductOp_ = multiply_add +> +void make_conv_all(Manifest &manifest) { + + make_conv_fprop< + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >(manifest); + + make_conv_backwards< + kConvDim, + ElementA_, LayoutA_, + ElementB_, LayoutB_, + ElementC_, LayoutC_, + ElementCompute_, + ElementAccumulator_, + ConvertOp_, + InnerProductOp_ + >(manifest); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/library/src/reference/initialize_reference_operations.cu b/tools/library/src/reference/initialize_reference_operations.cu index 016d91a6f2..c749c2bca9 100644 --- a/tools/library/src/reference/initialize_reference_operations.cu +++ b/tools/library/src/reference/initialize_reference_operations.cu @@ -37,10 +37,14 @@ namespace cutlass { namespace library { void initialize_gemm_reference_operations(Manifest &manifest); +void initialize_conv2d_reference_operations(Manifest &manifest); +void initialize_conv3d_reference_operations(Manifest &manifest); /////////////////////////////////////////////////////////////////////////////////////////////////// void initialize_reference_operations(Manifest &manifest) { + initialize_conv2d_reference_operations(manifest); + initialize_conv3d_reference_operations(manifest); initialize_gemm_reference_operations(manifest); } diff --git a/tools/library/src/util.cu b/tools/library/src/util.cu index 13fb9dfc0a..b20f505425 100644 --- a/tools/library/src/util.cu +++ b/tools/library/src/util.cu @@ -50,6 +50,7 @@ Provider_enumerants[] = { {"host", "reference_host", Provider::kReferenceHost}, {"device", "reference_device", Provider::kReferenceDevice}, {"cublas", "cuBLAS", Provider::kCUBLAS}, + {"cudnn", "cuDNN", Provider::kCUDNN}, }; /// Converts a Provider enumerant to a string @@ -128,6 +129,9 @@ static struct { OperationKind_enumerants[] = { {"eq_gemm", "EqGemm", OperationKind::kEqGemm}, {"gemm", "Gemm", OperationKind::kGemm}, + {"conv2d", "Conv2d", OperationKind::kConv2d}, + {"conv3d", "Conv3d", OperationKind::kConv3d}, + {"spgemm", "SparseGemm", OperationKind::kSparseGemm}, }; /// Converts a Status enumerant to a string @@ -445,6 +449,10 @@ layout_aliases[] = { {LayoutTypeID::kTensorNCDHW, "ncdhw"}, {LayoutTypeID::kTensorNHWC, "nhwc"}, {LayoutTypeID::kTensorNDHWC, "ndhwc"}, + {LayoutTypeID::kTensorNC32HW32, "nc32hw32"}, + {LayoutTypeID::kTensorNC64HW64, "nc64hw64"}, + {LayoutTypeID::kTensorC32RSK32, "c32rsk32"}, + {LayoutTypeID::kTensorC64RSK64, "c64rsk64"}, {LayoutTypeID::kUnknown, "*"}, {LayoutTypeID::kInvalid, nullptr} @@ -474,22 +482,46 @@ LayoutTypeID from_string(std::string const &str) { /// Gets stride rank for the layout_id (static function) int get_layout_stride_rank(LayoutTypeID layout_id) { switch (layout_id) { - case LayoutTypeID::kColumnMajor: return cutlass::layout::ColumnMajor::kStrideRank; - case LayoutTypeID::kRowMajor: return cutlass::layout::RowMajor::kStrideRank; + case LayoutTypeID::kColumnMajor: + return 
cutlass::layout::ColumnMajor::kStrideRank; + case LayoutTypeID::kRowMajor: + return cutlass::layout::RowMajor::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK2: + return cutlass::layout::ColumnMajorInterleaved<2>::kStrideRank; case LayoutTypeID::kRowMajorInterleavedK2: + return cutlass::layout::RowMajorInterleaved<2>::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK4: + return cutlass::layout::ColumnMajorInterleaved<4>::kStrideRank; case LayoutTypeID::kRowMajorInterleavedK4: + return cutlass::layout::RowMajorInterleaved<4>::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK16: + return cutlass::layout::ColumnMajorInterleaved<16>::kStrideRank; case LayoutTypeID::kRowMajorInterleavedK16: + return cutlass::layout::RowMajorInterleaved<16>::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK32: + return cutlass::layout::ColumnMajorInterleaved<32>::kStrideRank; case LayoutTypeID::kRowMajorInterleavedK32: + return cutlass::layout::RowMajorInterleaved<32>::kStrideRank; case LayoutTypeID::kColumnMajorInterleavedK64: - case LayoutTypeID::kRowMajorInterleavedK64: return 1; + return cutlass::layout::ColumnMajorInterleaved<64>::kStrideRank; + case LayoutTypeID::kRowMajorInterleavedK64: + return cutlass::layout::RowMajorInterleaved<64>::kStrideRank; case LayoutTypeID::kTensorNCHW: - case LayoutTypeID::kTensorNHWC: return 3; - case LayoutTypeID::kTensorNDHWC: return 4; - default : throw std::runtime_error("Unsupported LayoutTypeID in LayoutType::get_stride_rank"); + return cutlass::layout::TensorNCHW::kStrideRank; + case LayoutTypeID::kTensorNHWC: + return cutlass::layout::TensorNHWC::kStrideRank; + case LayoutTypeID::kTensorNDHWC: + return cutlass::layout::TensorNDHWC::kStrideRank; + case LayoutTypeID::kTensorNC32HW32: + return cutlass::layout::TensorNCxHWx<32>::kStrideRank; + case LayoutTypeID::kTensorNC64HW64: + return cutlass::layout::TensorNCxHWx<64>::kStrideRank; + case LayoutTypeID::kTensorC32RSK32: + return cutlass::layout::TensorCxRSKx<32>::kStrideRank; + case LayoutTypeID::kTensorC64RSK64: + return cutlass::layout::TensorCxRSKx<64>::kStrideRank; + default: + throw std::runtime_error("Unsupported LayoutTypeID in LayoutType::get_stride_rank"); } } @@ -624,6 +656,136 @@ SplitKMode from_string(std::string const &str) { } ///////////////////////////////////////////////////////////////////////////////////////////////// +static struct { + char const *text; + char const *pretty; + ConvModeID enumerant; +} +ConvModeID_enumerants[] = { + {"cross", "", ConvModeID::kCrossCorrelation}, + {"conv", "", ConvModeID::kConvolution}, +}; + +/// Converts a ConvModeID enumerant to a string +char const *to_string(ConvModeID type, bool pretty) { + + for (auto const & possible : ConvModeID_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + } + } + + return pretty ? 
"Invalid" : "invalid"; +} + +/// Converts a ConvModeID enumerant from a string +template <> +ConvModeID from_string(std::string const &str) { + + for (auto const & possible : ConvModeID_enumerants) { + if ((str.compare(possible.text) == 0) || + (str.compare(possible.pretty) == 0)) { + return possible.enumerant; + } + } + + return ConvModeID::kInvalid; +} + + +static struct { + char const *text; + char const *pretty; + IteratorAlgorithmID enumerant; +} +IteratorAlgorithmID_enumerants[] = { + {"none", "", IteratorAlgorithmID::kNone}, + {"analytic", "", IteratorAlgorithmID::kAnalytic}, + {"optimized", "", IteratorAlgorithmID::kOptimized}, +}; + +/// Converts a ConvModeID enumerant to a string +char const *to_string(IteratorAlgorithmID type, bool pretty) { + + for (auto const & possible : IteratorAlgorithmID_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + } + } + + return pretty ? "Invalid" : "invalid"; +} + +/// Converts a ConvModeID enumerant from a string +template <> +IteratorAlgorithmID from_string(std::string const &str) { + + for (auto const & possible : IteratorAlgorithmID_enumerants) { + if ((str.compare(possible.text) == 0) || + (str.compare(possible.pretty) == 0)) { + return possible.enumerant; + } + } + + return IteratorAlgorithmID::kInvalid; +} +/////////////////////////////////////////////////////////////////////////////////////////////////// + +static struct { + char const *text; + char const *pretty; + ConvKind enumerant; +} +ConvKind_enumerants[] = { + {"unknown", "", ConvKind::kUnknown}, + {"fprop", "", ConvKind::kFprop}, + {"dgrad", "", ConvKind::kDgrad}, + {"wgrad", "", ConvKind::kWgrad}, +}; + +/// Converts a ConvKind enumerant to a string +char const *to_string(ConvKind type, bool pretty) { + + for (auto const & possible : ConvKind_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + } + } + + return pretty ? "Invalid" : "invalid"; +} + + +/// Converts a ConvKind enumerant from a string +template <> +ConvKind from_string(std::string const &str) { + + for (auto const & possible : ConvKind_enumerants) { + if ((str.compare(possible.text) == 0) || + (str.compare(possible.pretty) == 0)) { + return possible.enumerant; + } + } + + return ConvKind::kInvalid; +} +/////////////////////////////////////////////////////////////////////////////////////////////////// + /// Lexical cast a string to a byte array. Returns true if cast is successful or false if invalid. 
bool lexical_cast(std::vector &bytes, NumericTypeID type, std::string const &str) { int size_bytes = sizeof_bits(type) / 8; @@ -1224,5 +1386,3 @@ bool cast_from_double(std::vector &bytes, NumericTypeID type, double sr } // namespace cutlass /////////////////////////////////////////////////////////////////////////////////////////////////// - - diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt index 52baacb1aa..3ac944a9f2 100644 --- a/tools/profiler/CMakeLists.txt +++ b/tools/profiler/CMakeLists.txt @@ -34,9 +34,12 @@ set(CUTLASS_TOOLS_PROFILER_SOURCES src/device_allocation.cu src/device_context.cu src/cublas_helpers.cpp + src/cudnn_helpers.cpp src/problem_space.cpp src/operation_profiler.cu src/gemm_operation_profiler.cu + src/conv2d_operation_profiler.cu + src/conv3d_operation_profiler.cu src/sparse_gemm_operation_profiler.cu ) @@ -58,7 +61,7 @@ set_target_properties(cutlass_profiler PROPERTIES EXPORT_NAME profiler) target_include_directories( cutlass_profiler PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/src # Source directory + ${CMAKE_CURRENT_LIST_DIR}/src ) # @@ -71,6 +74,7 @@ target_link_libraries( cutlass_lib cutlass_tools_util_includes $<$:nvidia::cublas> + $<$:nvidia::cudnn> cudart ) @@ -79,3 +83,16 @@ install( EXPORT NvidiaCutlass RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) + +set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM --operation=Gemm --providers=cutlass --verification-providers=cublas,device --junit-output=test_cutlass_profiler_gemm) +set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D --operation=Conv2d --providers=cutlass --verification-providers=cudnn,device --junit-output=test_cutlass_profiler_conv2d) +set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D --operation=Conv3d --providers=cutlass --verification-providers=cudnn,device,host --junit-output=test_cutlass_profiler_conv3d) +cutlass_add_executable_tests( + test_profiler cutlass_profiler + DEPENDEES test_all + TEST_COMMAND_OPTIONS + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D + DISABLE_EXECUTABLE_INSTALL_RULE + ) diff --git a/tools/profiler/src/conv2d_operation_profiler.cu b/tools/profiler/src/conv2d_operation_profiler.cu new file mode 100644 index 0000000000..4b91535719 --- /dev/null +++ b/tools/profiler/src/conv2d_operation_profiler.cu @@ -0,0 +1,1468 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Convolution 2D profiling +*/ + +#include +#include +#include +#include + +#include "cutlass/core_io.h" + +#include "conv2d_operation_profiler.h" +#include "gpu_timer.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// +using namespace cutlass::library; + +namespace cutlass { +namespace profiler { + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Ctor +Conv2dOperationProfiler::Conv2dOperationProfiler(Options const &options): + OperationProfiler( + options, + library::OperationKind::kConv2d, + { + {ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"}, + {ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"h", "input_h"}, "Input H dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"w", "input_w"}, "Input W dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"c", "input_c"}, "Input C dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"k", "filter_k"}, "Filter K dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"r", "filter_r"}, "Filter R dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"s", "filter_s"}, "Filter S dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"p", "output_p"}, "Output P dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"q", "output_q"}, "Output Q dimension of the Conv2d problem space"}, + {ArgumentTypeID::kInteger, {"pad_h"}, "Padding in H direction"}, + {ArgumentTypeID::kInteger, {"pad_w"}, "Padding in W direction"}, + {ArgumentTypeID::kInteger, {"stride_h"}, "Stride in H direction"}, + {ArgumentTypeID::kInteger, {"stride_w"}, "Stride in W direction"}, + {ArgumentTypeID::kInteger, {"dilation_h"}, "Dilation in H direction"}, + {ArgumentTypeID::kInteger, {"dilation_w"}, "Dilation in W direction"}, + {ArgumentTypeID::kTensor, {"Activation"}, "Tensor storing the Activation operand"}, + {ArgumentTypeID::kTensor, {"Filter"}, "Tensor storing the Filter operand"}, + {ArgumentTypeID::kTensor, {"Output"}, "Tensor storing the Output operand"}, + {ArgumentTypeID::kEnumerated, {"conv_mode"}, "Convolution filter mode (conv, cross)"}, + {ArgumentTypeID::kEnumerated, {"iterator_algorithm", "iterator_algo"}, "Convolution iterator algorithm (analytic, optimized)"}, + {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, + {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, + {ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "SplitK mode for serial or parallel reduction (serial, parallel)"}, + {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"}, + {ArgumentTypeID::kEnumerated, 
{"eq_gemm_provider", "eq-gemm-provider"}, "Enable profiling equivalent gemm by the following providers (cutlass)"}, + }, + { library::Provider::kReferenceDevice, library::Provider::kReferenceHost, library::Provider::kCUDNN } + ) { + + description_ = " Conv2d operation. Output(Tensor4D) = alpha * Input(Tensor4D) * Filter(Tensor4D) + beta * Input(Tensor4D)"; + +} + +/// Destructor +Conv2dOperationProfiler::~Conv2dOperationProfiler() { + +} + + +/// Prints usage statement for the math function +void Conv2dOperationProfiler::print_usage(std::ostream &out) const { + out << "Conv2d" << "\n\n"; + + OperationProfiler::print_usage(out); +} + +/// Prints examples +void Conv2dOperationProfiler::print_examples(std::ostream &out) const { + + out << "\nExamples:\n\n" + << "Profile a particular convolution (specify all the convolution parameters):\n" + << " $ cutlass_profiler --operation=Conv2d" + " --Activation=f16:nhwc --Filter=f16:nhwc --Output=f16 --accumulator-type=f32" + " --n=32 --h=14 --w=14 --c=8 --k=64 --r=3 --s=3" + " --pad_h=1 --pad_w=1" + " --stride::h=1 --stride::w=1" + " --dilation::h=1 --dilation::w=1\n\n"; +} + +#if 0 +// used this for debugging +static std::string byte_string(std::vector const &bytes) { + std::stringstream ss; + + ss << "0x"; + + for (size_t idx = bytes.size(); idx > 0; --idx) { + ss << std::hex << std::setw(2) << std::setfill('0') << uint32_t(bytes.at(idx - 1)); + } + + return ss.str(); +} +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Total number of bytes loaded +int64_t Conv2dOperationProfiler::Conv2dProblem::bytes( + library::ConvDescription const &operation_desc) const { + + cutlass::gemm::GemmCoord mnk = eq_gemm_size(operation_desc.conv_kind); + + // Input bytes read and Output bytes written for the gemm problem + int64_t bytes_ = + int64_t(library::sizeof_bits(operation_desc.A.element) * mnk.m() / 8) * mnk.k() + + int64_t(library::sizeof_bits(operation_desc.B.element) * mnk.n() / 8) * mnk.k() + + int64_t(library::sizeof_bits(operation_desc.C.element) * mnk.m() / 8) * mnk.n(); + + // Set is_beta_zero true if beta is zero + bool is_beta_zero = std::all_of(beta.begin(), beta.end(), [](uint8_t i) { return i==0; }); + + // Output bytes read for the gemm problem for non-zero beta values + if (!is_beta_zero) { + bytes_ += int64_t(library::sizeof_bits(operation_desc.C.element) * mnk.m() / 8) * mnk.n(); + } + + return bytes_; +} + +/// Total number of flops computed +int64_t Conv2dOperationProfiler::Conv2dProblem::flops( + library::ConvDescription const &operation_desc) const { + + cutlass::gemm::GemmCoord mnk = eq_gemm_size(operation_desc.conv_kind); + + int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2; + int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2; + + // Adjust mainloop flop for dgrad strided + if (operation_desc.conv_kind == library::ConvKind::kDgrad) { + flops_mainloop_ = flops_mainloop_ / (stride_h * stride_w); + } + int64_t flops_total_ = flops_mainloop_ + flops_epilogue_; + + //complex-valued support + switch (operation_desc.tile_description.math_instruction.math_operation) { + case library::MathOperationID::kMultiplyAddComplex: + flops_total_ *=4; + break; + + default: break; + } + + return flops_total_; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Extracts the problem dimensions +Status Conv2dOperationProfiler::initialize_configuration( + Options const &options, + PerformanceReport 
&report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + library::ConvDescription const &operation_desc = + static_cast(operation->description()); + + if (!arg_as_int(problem_.n, "n", problem_space, problem)) { + // default value + problem_.n = 1; + } + + if (!arg_as_int(problem_.h, "h", problem_space, problem)) { + // default value + problem_.h = 16; + } + + if (!arg_as_int(problem_.w, "w", problem_space, problem)) { + // default value + problem_.w = 16; + } + + if (!arg_as_int(problem_.c, "c", problem_space, problem)) { + // default value + problem_.c = 64; + } + + if (!arg_as_int(problem_.k, "k", problem_space, problem)) { + // default value + problem_.k = 64; + } + + if (!arg_as_int(problem_.r, "r", problem_space, problem)) { + // default value + problem_.r = 3; + } + + if (!arg_as_int(problem_.s, "s", problem_space, problem)) { + // default value + problem_.s = 3; + } + + if (!arg_as_int(problem_.pad_h, "pad_h", problem_space, problem)) { + // default value + problem_.pad_h = 1; + } + + if (!arg_as_int(problem_.pad_w, "pad_w", problem_space, problem)) { + // default value + problem_.pad_w = 1; + } + + if (!arg_as_int(problem_.stride_h, "stride_h", problem_space, problem)) { + // default value + problem_.stride_h = 1; + } + + if (!arg_as_int(problem_.stride_w, "stride_w", problem_space, problem)) { + // default value + problem_.stride_w = 1; + } + + if (!arg_as_int(problem_.dilation_h, "dilation_h", problem_space, problem)) { + // default value + problem_.dilation_h = 1; + } + + if (!arg_as_int(problem_.dilation_w, "dilation_w", problem_space, problem)) { + // default value + problem_.dilation_w = 1; + } + + //////////////////////// Convolution output dimensions p and q //////////////////////// + // Cutlass convolutions support arbitrary output sizes and not constriant by // + // input, filter, padding, striding, dilation sizes. // + // cuDNN sets the output dimensions (p, q) using following equations: // + // // + // output = div_up(input + 2 * pad - ((filter - 1) * dilation + 1) + 1, stride) // + // where; div_up(a, b) : (a - 1)/b + 1 // + // // + // Thus, when output p and q dimensions are unspecified by the user // + // cutlass profiler sets p and q which are cuDNN compliant. 
// + // // + //////////////////////////////////////////////////////////////////////////////////////// + // set convolution output p + if (!arg_as_int(problem_.p, "p", problem_space, problem)) { + // default value (set using cudnn formula for output height, when p is not provided) + problem_.p = ( + problem_.h + + 2 * problem_.pad_h - + ((problem_.r - 1) * problem_.dilation_h + 1) + ) / (problem_.stride_h) + + 1; + } + + // set convolution output q + if (!arg_as_int(problem_.q, "q", problem_space, problem)) { + // default value (set using cudnn formula for output width, when q is not provided) + problem_.q = ( + problem_.w + + 2 * problem_.pad_w - + ((problem_.s - 1) * problem_.dilation_w + 1) + ) / (problem_.stride_w) + + 1; + } + ///////////////////////////////////////////////////////////////////////////////////////// + + + if (!arg_as_SplitKModeID(problem_.split_k_mode, "split_k_mode", problem_space, problem)) { + // default value + problem_.split_k_mode = library::SplitKMode::kSerial; + } + + if (!arg_as_int(problem_.split_k_slices, "split_k_slices", problem_space, problem)) { + // default value + problem_.split_k_slices = 1; + } + + if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) { + // default value + problem_.conv_mode = library::ConvModeID::kCrossCorrelation; + } + + if (!arg_as_ProviderID(problem_.eq_gemm_provider, "eq_gemm_provider", problem_space, problem)) { + // default value + problem_.eq_gemm_provider = library::Provider::kNone; + } + + if (!conv_kind_satisfies(operation_desc.conv_kind, "conv_kind", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!iterator_algorithm_satisfies(operation_desc.iterator_algorithm, "iterator_algorithm", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.activation(), "Activation", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.filter(), "Filter", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.output(), "Output", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!arg_as_scalar( + problem_.alpha, + operation_desc.element_epilogue, + "alpha", + problem_space, + problem)) { + + if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) { + return Status::kErrorInternal; + } + } + + if (!arg_as_scalar( + problem_.beta, + operation_desc.element_epilogue, + "beta", + problem_space, + problem)) { + + if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) { + return Status::kErrorInternal; + } + } + + // initialize library::Conv2dConfiguration + conv_workspace_.configuration.problem_size = conv::Conv2dProblemSize( + int(problem_.n), + int(problem_.h), + int(problem_.w), + int(problem_.c), + int(problem_.k), + int(problem_.r), + int(problem_.s), + int(problem_.p), + int(problem_.q), + int(problem_.pad_h), + int(problem_.pad_w), + int(problem_.stride_h), + int(problem_.stride_w), + int(problem_.dilation_h), + int(problem_.dilation_w), + static_cast(static_cast(problem_.conv_mode)), + int(problem_.split_k_slices), + 1 // groups + ); + + conv_workspace_.configuration.split_k_mode = static_cast(static_cast(problem_.split_k_mode)); + + conv_workspace_.configuration.layout_activations.stride() = make_Coord( + int(problem_.c), + int(problem_.w) * int(problem_.c), + int(problem_.h) * int(problem_.w) * int(problem_.c) + ); 
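+
+  // Worked example (illustrative) using the default problem above
+  // (n=1, h=w=16, c=64, k=64, r=s=3, pad=1, stride=1, dilation=1) and the
+  // cuDNN output-size formula documented earlier:
+  //   p = (16 + 2*1 - ((3 - 1)*1 + 1)) / 1 + 1 = 16
+  //   q = (16 + 2*1 - ((3 - 1)*1 + 1)) / 1 + 1 = 16
+  // so the packed NHWC activation strides set above evaluate to
+  //   {c, w*c, h*w*c} = {64, 1024, 16384}.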
+ + conv_workspace_.configuration.layout_filters.stride() = make_Coord( + int(problem_.c), + int(problem_.s) * int(problem_.c), + int(problem_.r) * int(problem_.s) * int(problem_.c) + ); + + conv_workspace_.configuration.layout_output.stride() = make_Coord( + int(problem_.k), + int(problem_.q) * int(problem_.k), + int(problem_.q) * int(problem_.p) * int(problem_.k) + ); + + + // initialize library::ConvArguments + conv_workspace_.arguments.A = nullptr; + conv_workspace_.arguments.B = nullptr; + conv_workspace_.arguments.C = nullptr; + conv_workspace_.arguments.D = nullptr; + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // initialize reduction operation for parallel splitKMode + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if(!initialize_reduction_configuration_(options, report, device_context, operation, problem_space, problem)) { + return Status::kErrorInternal; + } + } + + initialize_result_(this->model_result_, options, operation_desc, problem_space); + + return operation->can_implement(&conv_workspace_.configuration, &conv_workspace_.arguments); +} + +/// Initializes the performance result +void Conv2dOperationProfiler::initialize_result_( + PerformanceResult &result, + Options const &options, + library::ConvDescription const &operation_desc, + ProblemSpace const &problem_space) { + + result.provider = library::Provider::kCUTLASS; + result.disposition = Disposition::kNotRun; + result.status = Status::kSuccess; + result.operation_name = operation_desc.name; + + result.arguments.resize(problem_space.rank()); + + set_argument(result, "Activation", problem_space, + std::string(library::to_string(operation_desc.activation().element)) + + ":" + library::to_string(operation_desc.activation().layout)); + + set_argument(result, "Filter", problem_space, + std::string(library::to_string(operation_desc.filter().element)) + + ":" + library::to_string(operation_desc.filter().layout)); + + set_argument(result, "Output", problem_space, + std::string(library::to_string(operation_desc.output().element)) + + ":" + library::to_string(operation_desc.output().layout)); + + set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind)); + + set_argument(result, "iterator_algorithm", problem_space, std::string(library::to_string(operation_desc.iterator_algorithm))); + + set_argument(result, "n", problem_space, problem_.n); + set_argument(result, "h", problem_space, problem_.h); + set_argument(result, "w", problem_space, problem_.w); + set_argument(result, "c", problem_space, problem_.c); + + set_argument(result, "k", problem_space, problem_.k); + set_argument(result, "r", problem_space, problem_.r); + set_argument(result, "s", problem_space, problem_.s); + + set_argument(result, "p", problem_space, problem_.p); + set_argument(result, "q", problem_space, problem_.q); + + set_argument(result, "pad_h", problem_space, problem_.pad_h); + set_argument(result, "pad_w", problem_space, problem_.pad_w); + + set_argument(result, "stride_h", problem_space, problem_.stride_h); + set_argument(result, "stride_w", problem_space, problem_.stride_w); + + set_argument(result, "dilation_h", problem_space, problem_.dilation_h); + set_argument(result, "dilation_w", problem_space, problem_.dilation_w); + + set_argument(result, "split_k_mode", problem_space, + std::string(library::to_string(problem_.split_k_mode))); 
+  set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);
+
+  set_argument(result, "conv_mode", problem_space,
+    std::string(library::to_string(problem_.conv_mode)));
+
+  set_argument(result, "alpha", problem_space,
+    library::lexical_cast(problem_.alpha, operation_desc.element_epilogue));
+
+  set_argument(result, "beta", problem_space,
+    library::lexical_cast(problem_.beta, operation_desc.element_epilogue));
+
+  set_argument(result, "eq_gemm_provider", problem_space,
+    std::string(library::to_string(problem_.eq_gemm_provider)));
+
+  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
+
+  // Bytes of activation, filter, and output tensors
+  int64_t activation_bytes = int64_t(library::sizeof_bits(operation_desc.activation().element) / 8) *
+    conv_workspace_.configuration.problem_size.activation_size();
+
+  int64_t filter_bytes = int64_t(library::sizeof_bits(operation_desc.filter().element) / 8) *
+    conv_workspace_.configuration.problem_size.filter_size();
+
+  int64_t output_bytes = int64_t(library::sizeof_bits(operation_desc.output().element) / 8) *
+    conv_workspace_.configuration.problem_size.output_size();
+
+  // Total bytes read and written, modeled on the equivalent GEMM problem
+  result.bytes = problem_.bytes(operation_desc);
+
+  // Theoretical flops required for the computation
+  result.flops = problem_.flops(operation_desc);
+
+  // Measured runtime
+  result.runtime = 0;
+
+}
+
+/// Initializes reduction problem dimensions and library::Operation
+bool Conv2dOperationProfiler::initialize_reduction_configuration_(
+  Options const &options,
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  library::ConvDescription const &conv_desc =
+    static_cast<library::ConvDescription const &>(operation->description());
+
+  library::ConvKind const &conv_kind = conv_desc.conv_kind;
+
+  if (!cast_from_double(problem_.alpha_one, conv_desc.element_epilogue, 1)) {
+    return false;
+  }
+
+  if (!cast_from_double(problem_.beta_zero, conv_desc.element_epilogue, 0)) {
+    return false;
+  }
+
+  /// This chooses the appropriate stride element of the row-major C tensor.
+  int const & tensor_c_stride_idx = (conv_kind == library::ConvKind::kWgrad ?
2 : 0); + + /// intialize library::ReductionConfiguration + conv_workspace_.reduction_configuration.problem_size = problem_.eq_gemm_size(conv_kind).mn(); + conv_workspace_.reduction_configuration.partitions = int(problem_.split_k_slices); + conv_workspace_.reduction_configuration.partition_stride = problem_.eq_gemm_size(conv_kind).mn().product(); + conv_workspace_.reduction_configuration.ldw = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx]; + conv_workspace_.reduction_configuration.lds = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx]; + conv_workspace_.reduction_configuration.ldd = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx]; + + // find reduction operation + library::ReductionFunctionalKey reduction_key( + library::Provider::kCUTLASS, + conv_desc.tile_description.math_instruction.element_accumulator, // element workspace + conv_desc.tile_description.math_instruction.element_accumulator, // element accumulator + conv_desc.C.element, // element output + conv_desc.element_epilogue // element compute + ); + +#if 0// debug print to check which reduction instance is selected + std::cout << reduction_key << "\n"; +#endif + auto reduction_it = Singleton::get().operation_table.reduction_operations.find(reduction_key); + + if(reduction_it == Singleton::get().operation_table.reduction_operations.end()) { + + return false; + } + + // initialize reduction operation required for parallel split-k conv2d operator + reduction_op_ = reduction_it->second; + + // reduction operation found and initialized + return true; +} + + +/// Initializes workspace +Status Conv2dOperationProfiler::initialize_workspace( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + return Status::kErrorNotSupported; + } + } + + library::ConvDescription const &operation_desc = + static_cast(underlying_operation->description()); + + // Compute the number of copies of the problem to avoid L2 camping. 
+ if (!options.profiling.workspace_count) { + int64_t bytes = problem_.bytes(operation_desc); + if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) { + conv_workspace_.problem_count = + 1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes); + } + else { + conv_workspace_.problem_count = 1; + } + } + else { + conv_workspace_.problem_count = options.profiling.workspace_count; + } + + + if (options.execution_mode != ExecutionMode::kDryRun) { + + conv_workspace_.A = device_context.allocate_tensor( + options, + "A", + operation_desc.A.element, + operation_desc.A.layout, + problem_.extent_a(operation_desc.conv_kind), + conv_workspace_.stride_a(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.B = device_context.allocate_tensor( + options, + "B", + operation_desc.B.element, + operation_desc.B.layout, + problem_.extent_b(operation_desc.conv_kind), + conv_workspace_.stride_b(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.C = device_context.allocate_tensor( + options, + "C", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.Computed = device_context.allocate_tensor( + "D", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.Reference = device_context.allocate_tensor( + "Reference", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + } + + // + // Initialize the CUTLASS operation + // + Status status = Status::kSuccess; + + if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + + if (options.execution_mode != ExecutionMode::kDryRun) { + + uint64_t workspace_size = underlying_operation->get_host_workspace_size(&conv_workspace_.configuration); + conv_workspace_.host_workspace.resize(workspace_size, 0); + + workspace_size = underlying_operation->get_device_workspace_size(&conv_workspace_.configuration); + conv_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size); + + status = underlying_operation->initialize( + &conv_workspace_.configuration, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data()); + + if (status != Status::kSuccess) { + return status; + } + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + workspace_size = reduction_op_->get_host_workspace_size(&conv_workspace_.reduction_configuration); + conv_workspace_.reduction_host_workspace.resize(workspace_size, 0); + + status = reduction_op_->initialize( + &conv_workspace_.reduction_configuration, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + + if (status != Status::kSuccess) { + return status; + } + } + } + + // + // If CUTLASS is enabled, generate a result for it + // + results_.push_back(model_result_); + results_.back().provider = library::Provider::kCUTLASS; + results_.back().op_kind = library::OperationKind::kConv2d; + results_.back().disposition = Disposition::kNotRun; + + for(auto provider : verification_providers_) { + results_.back().verification_map[provider] = Disposition::kNotRun; + } + } + + return status; +} + 
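+#if 0
+// Illustrative sketch only: a hypothetical standalone helper mirroring the
+// workspace-count logic in initialize_workspace() above. The profiler rotates
+// through `problem_count` copies of the tensors so consecutive iterations touch
+// different memory and the working set cannot stay resident in L2. For example,
+// assuming a GPU with a 40 MiB L2 and a problem touching 8 MiB of tensor data:
+//   problem_count = 1 + (3 * 40 MiB) / (8 MiB) = 16 rotating copies.
+static int compute_rotating_problem_count(int64_t problem_bytes, int64_t l2_cache_bytes) {
+
+  if (problem_bytes < 3 * l2_cache_bytes) {
+    return 1 + int((3 * l2_cache_bytes) / problem_bytes);
+  }
+
+  return 1;
+}
+#endif
+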
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Verifies CUTLASS against references +bool Conv2dOperationProfiler::verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + if (!options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + return true; + } + + if (options.execution_mode == ExecutionMode::kDryRun) { + return true; + } + + cudaError_t result; + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.C = conv_workspace_.C->data(); + conv_workspace_.arguments.D = conv_workspace_.Computed->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data()); + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + conv_workspace_.arguments.D = conv_workspace_.device_workspace.data(); + conv_workspace_.arguments.alpha = problem_.alpha_one.data(); + conv_workspace_.arguments.beta = problem_.beta_zero.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->data(); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->data(); + conv_workspace_.reduction_arguments.alpha = problem_.alpha.data(); + conv_workspace_.reduction_arguments.beta = problem_.beta.data(); + conv_workspace_.reduction_arguments.pointer_mode = library::ScalarPointerMode::kHost; + } + + // + // Run the CUTLASS operation + // + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + results_.back().disposition = Disposition::kFailed; + return false; + } + } + +#if 0 + std::cout << "profiling : " << std::endl + << "conv2d : " << operation->description().name << std::endl + << "underlying conv2d : " << underlying_operation->description().name << std::endl + << "reduction : " << reduction_op_->description().name << std::endl; +#endif + + // run cutlass conv2d operation + results_.back().status = underlying_operation->run( + &conv_workspace_.arguments, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data()); + + if (results_.back().status != Status::kSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + results_.back().status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + + if (results_.back().status != Status::kSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + } + + // Synchronize before running 
device reference + result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + // CUTLASS op ran the but not yet verified against any verification provider + results_.back().disposition = Disposition::kNotVerified; + + // + // Run verification providers + // + + if (options.verification.enabled) { + +#if CUTLASS_ENABLE_CUDNN + // Run verification cudnn reference + if (options.verification.provider_enabled(library::Provider::kCUDNN)) { + + // Guard against unsupported cases + auto const & conv_desc = static_cast(operation->description()); + + Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration); + + // Initialize reference data to the source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + if (status == Status::kSuccess) { + // call cudnn verification if supported + verify_with_cudnn_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + else if (status == Status::kErrorInvalidProblem) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kInvalidProblem; + } + + else { + // set verification map for cudnn to not supported + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kNotSupported; + } + } +#endif // #if CUTLASS_ENABLE_CUDNN + + // Run verification device reference + if (options.verification.provider_enabled(library::Provider::kReferenceDevice)) { + + // Restore reference data back to initial source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + verify_with_device_reference_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + // Run verification host reference + if (options.verification.provider_enabled(library::Provider::kReferenceHost)) { + + // Restore reference data back to initial source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + verify_with_host_reference_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + // Update disposition to worst case verification outcome among all + // verification providers which are supported + bool is_any_verification_run_passed = false; + for(auto &m : results_.back().verification_map) { + if(m.second == Disposition::kFailed || m.second == Disposition::kIncorrect) { + results_.back().disposition = m.second; + return true; + } + if(!is_any_verification_run_passed && m.second == Disposition::kPassed) { + is_any_verification_run_passed = true; + } + } + + if(is_any_verification_run_passed) { + results_.back().disposition = Disposition::kPassed; + } + } + + // Return true means continue profiling + return true; +} + + +/// Verifies CUTLASS against host reference +bool Conv2dOperationProfiler::verify_with_host_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + Status status; + + // + // Find host reference operation using conv2d functional description key + // + library::OperationDescription const &desc = operation->description(); + + auto &conv_desc = static_cast(desc); + + library::ConvFunctionalKey conv2d_key( + library::Provider::kReferenceHost, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.C.element, + conv_desc.C.layout, + 
conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue); + +#if 0 // debug print to check which host refererence instance is selected + std::cout << conv2d_key << "\n"; +#endif + + auto operators_it = Singleton::get().operation_table.conv2d_operations.find(conv2d_key); + + if(operators_it == Singleton::get().operation_table.conv2d_operations.end()) { + + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun; + return true; + } + + // conv2d host reference minimum cc is 0 (CPU) and no iterator algorithm + library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone); + auto cc_it = operators_it->second.find(preference_key); + + if(cc_it == operators_it->second.end()) { + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun; + return true; + } + + // host refernce has only one instances in Conv2dOperationVectorMap + library::Operation const *reference_op = cc_it->second[0]; + + // + // Copy input tensors A, B, and C from device to host buffers + // + conv_workspace_.host_tensor_a.resize(conv_workspace_.A->bytes()); + conv_workspace_.host_tensor_b.resize(conv_workspace_.B->bytes()); + conv_workspace_.host_tensor_c.resize(conv_workspace_.C->bytes()); + + conv_workspace_.A->copy_to_host(conv_workspace_.host_tensor_a.data()); + conv_workspace_.B->copy_to_host(conv_workspace_.host_tensor_b.data()); + conv_workspace_.C->copy_to_host(conv_workspace_.host_tensor_c.data()); + + // + // Initialize structure containing Conv2d arguments + // + conv_workspace_.arguments.A = conv_workspace_.host_tensor_a.data(); + conv_workspace_.arguments.B = conv_workspace_.host_tensor_b.data(); + conv_workspace_.arguments.C = conv_workspace_.host_tensor_c.data(); + conv_workspace_.arguments.D = conv_workspace_.host_tensor_c.data(); + + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // + // Intialize host reference operation + // + std::vector host_workspace_reference_op; + + uint64_t workspace_size = reference_op->get_host_workspace_size(&conv_workspace_.configuration); + host_workspace_reference_op.resize(workspace_size, 0); + + reference_op->initialize( + &conv_workspace_.configuration, + host_workspace_reference_op.data()); + + // + // Run host reference operation + // + status = reference_op->run( + &conv_workspace_.arguments, + host_workspace_reference_op.data()); + + // Handle errors + if (status != Status::kSuccess) { + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotVerified; + return true; + } + + // + // Copy host reference output to device memory for equality check on device + // + conv_workspace_.Reference->copy_from_host(conv_workspace_.arguments.D); + + // + // Verify results + // + results_.back().verification_map[library::Provider::kReferenceHost] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference, + conv_workspace_.Computed->batch_stride() + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + static_cast(operation->description()), + library::Provider::kCUTLASS, + library::Provider::kReferenceHost); + } + + // Return true means continue 
profiling + return true; +} + + +/// Verifies CUTLASS against host reference +bool Conv2dOperationProfiler::verify_with_device_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + Status status; + + // + // Find device reference operation using conv2d functional description key + // + library::OperationDescription const &desc = operation->description(); + + auto &conv_desc = static_cast(desc); + + library::ConvFunctionalKey conv2d_key( + library::Provider::kReferenceDevice, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.C.element, + conv_desc.C.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue); + + auto operators_it = Singleton::get().operation_table.conv2d_operations.find(conv2d_key); + + if(operators_it == Singleton::get().operation_table.conv2d_operations.end()) { + + results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun; + + return true; + } + + // conv2d device reference minimum cc is 50 and no iterator algorithm + library::ConvPreferenceKey preference_key(50, library::IteratorAlgorithmID::kNone); + auto cc_it = operators_it->second.find(preference_key); + + if(cc_it == operators_it->second.end()) { + results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun; + + return true; + } + + // device refernce has only one instances in Conv2dOperationVectorMap + library::Operation const *reference_op = cc_it->second[0]; + + // + // Intialize device reference operation + // + std::vector host_workspace_reference_op; + + uint64_t workspace_size = reference_op->get_host_workspace_size(&conv_workspace_.configuration); + host_workspace_reference_op.resize(workspace_size, 0); + + reference_op->initialize( + &conv_workspace_.configuration, + host_workspace_reference_op.data()); + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.C = conv_workspace_.C->data(); + conv_workspace_.arguments.D = conv_workspace_.Reference->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // + // Run device reference operation + // + status = reference_op->run( + &conv_workspace_.arguments, + host_workspace_reference_op.data()); + + + // Handle errors + if (status != Status::kSuccess) { + results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotVerified; + return true; + } + + // + // Verify results + // + results_.back().verification_map[library::Provider::kReferenceDevice] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference, + conv_workspace_.Computed->batch_stride() + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kReferenceDevice] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + static_cast(operation->description()), + library::Provider::kCUTLASS, + library::Provider::kReferenceDevice); + } + + // Return true means continue profiling 
+ return true; +} + +/// Measures performance results +bool Conv2dOperationProfiler::profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + + if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.C = conv_workspace_.C->data(); + conv_workspace_.arguments.D = conv_workspace_.Computed->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + conv_workspace_.arguments.D = conv_workspace_.device_workspace.data(); + conv_workspace_.arguments.alpha = problem_.alpha_one.data(); + conv_workspace_.arguments.beta = problem_.beta_zero.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->data(); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->data(); + conv_workspace_.reduction_arguments.alpha = problem_.alpha.data(); + conv_workspace_.reduction_arguments.beta = problem_.beta.data(); + conv_workspace_.reduction_arguments.pointer_mode = library::ScalarPointerMode::kHost; + } + + results_.back().status = profile_cutlass_( + results_.back().runtime, + options, + operation, + &conv_workspace_.arguments, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data() + ); + } + return true; + +} + +/// Method to profile a CUTLASS Operation +Status Conv2dOperationProfiler::profile_cutlass_( + double &runtime, + Options const &options, + library::Operation const *operation, + void *arguments, + void *host_workspace, + void *device_workspace) { + + GpuTimer timer; + + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + library::ConvArguments *conv_arguments = static_cast(arguments); + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + return Status::kErrorNotSupported; + } + } + + // + // Optional sleep to limit power consumption and thermals + // + + sleep(options.profiling.sleep_duration); + + // + // Warmup loop + // + + Status status; + + for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) { + + // Setup rotating workspace + int workspace_idx = options.profiling.warmup_iterations + iteration; + int problem_idx = (workspace_idx % conv_workspace_.problem_count); + + conv_arguments->A = conv_workspace_.A->batch_data(problem_idx); + conv_arguments->B = conv_workspace_.B->batch_data(problem_idx); + conv_arguments->C = conv_workspace_.C->batch_data(problem_idx); + conv_arguments->D = conv_workspace_.Computed->batch_data(problem_idx); + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + 
conv_arguments->D = conv_workspace_.device_workspace.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->batch_data(problem_idx); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->batch_data(problem_idx); + } + + // Run underlying conv2d operation + status = underlying_operation->run( + arguments, + host_workspace, + device_workspace); + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + } + + if (status != Status::kSuccess) { + return status; + } + } + + // + // Initialize GPU timer + // + + timer.start(); + + // + // Profiling loop + // + + int Iterations = options.profiling.iterations; + + int iteration = 0; + for (; iteration < Iterations; ++iteration) { + + // Setup rotating workspace + int problem_idx = (iteration % conv_workspace_.problem_count); + + conv_arguments->A = conv_workspace_.A->batch_data(problem_idx); + conv_arguments->B = conv_workspace_.B->batch_data(problem_idx); + conv_arguments->C = conv_workspace_.C->batch_data(problem_idx); + conv_arguments->D = conv_workspace_.Computed->batch_data(problem_idx); + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + conv_arguments->D = conv_workspace_.device_workspace.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->batch_data(problem_idx); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->batch_data(problem_idx); + } + + // Run underlying conv2d operation + status = underlying_operation->run( + arguments, + host_workspace, + device_workspace); + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + } + + if (status != Status::kSuccess) { + return status; + } + } + + // + // Wait for completion + // + + timer.stop_and_wait(); + + // + // Update performance result + // + + runtime = timer.duration(iteration); + + return status; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +#if CUTLASS_ENABLE_CUDNN + +/// Verifies CUTLASS against cudnn reference +bool Conv2dOperationProfiler::verify_with_cudnn_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + auto &conv_desc = static_cast(operation->description()); + + // + // Construct cudnn operators + // + + CudnnCreate handle; + cudnnStatus_t status = handle.get_cudnn_create_status(); + + if (status != CUDNN_STATUS_SUCCESS) { + + results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status); + return true; + } + + // + // Initialize state + // + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = 
conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.D = conv_workspace_.Reference->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // cuDNN does not support four tensor arguments, so we copy the tensor C data into + // tensor D. + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + conv_workspace_.arguments.C = conv_workspace_.arguments.D; + + try { + + // + // Construct dispatcher to cudnn operator + // + + detail::cudnnConvDispatcher conv_op( + conv_desc, + conv_workspace_.configuration, + conv_workspace_.arguments, + handle + ); + + if (conv_op.status != Status::kSuccess) { + if (conv_op.status == Status::kErrorNotSupported) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kNotSupported; + + } else { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kFailed; + } + return true; + } + + + status = conv_op(handle); + + // Handle errors + if (status != CUDNN_STATUS_SUCCESS) { + + results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status); + return true; + } + + // + // Verify results + // + + results_.back().verification_map[library::Provider::kCUDNN] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference, + conv_workspace_.Computed->batch_stride() + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + conv_desc, + library::Provider::kCUTLASS, + library::Provider::kCUDNN); + } + } + catch (...) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kFailed; + } + + // Return true means continue profiling + return true; +} + +#endif // #if CUTLASS_ENABLE_CUDNN + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/conv2d_operation_profiler.h b/tools/profiler/src/conv2d_operation_profiler.h new file mode 100644 index 0000000000..40c003e1d4 --- /dev/null +++ b/tools/profiler/src/conv2d_operation_profiler.h @@ -0,0 +1,431 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines profiling functionality for convolution + +*/ + +#pragma once + +#include +#include +#include +#include +#include + +// CUTLASS Library includes +#include "cutlass/library/library.h" +#include "cutlass/library/util.h" +#include "cutlass/library/handle.h" +#include "cutlass/library/manifest.h" +#include "cutlass/library/singleton.h" + +// Profiler includes +#include "options.h" +#include "device_context.h" +#include "operation_profiler.h" +#include "performance_result.h" +#include "problem_space.h" +#include "reduction_operation_profiler.h" +#if CUTLASS_ENABLE_CUDNN +#include "cudnn_helpers.h" +#endif //#if CUTLASS_ENABLE_CUDNN +#include "debug.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Abstract base class for each math function +class Conv2dOperationProfiler : public OperationProfiler { +public: + + /// Problem structure obtained from problem space + struct Conv2dProblem { + + int64_t n, h, w, c, p, q, k, r, s; + int64_t pad_h, pad_w; + int64_t stride_h, stride_w; + int64_t dilation_h, dilation_w; + + std::vector alpha; + std::vector beta; + + library::SplitKMode split_k_mode; + int64_t split_k_slices; + + library::ConvModeID conv_mode; + + library::Provider eq_gemm_provider; + + // convolution with parallel interleaved reduction + // convolution epilogue (alpha, beta) = (1.0, 0.0) + // reduction epilogue (alpha, beta) = (Conv2dProblem::alpha, Conv2dProblem::beta) + std::vector alpha_one; + std::vector beta_zero; + + // + // Methods + // + + /// Total number of bytes loaded + int64_t bytes(library::ConvDescription const &operation_desc) const; + + /// Total number of flops computed + int64_t flops(library::ConvDescription const &operation_desc) const; + + void set_default_output_size() { + p = ((h + pad_h - r * dilation_h) / stride_h) + 1; + q = ((w + pad_w - s * dilation_w) / stride_w) + 1; + } + + // Returns equivalent gemm problem size for convolution + cutlass::gemm::GemmCoord eq_gemm_size(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return cutlass::gemm::GemmCoord(int(n * p * q), int(k), int(r * s * c)); + case library::ConvKind::kDgrad: return cutlass::gemm::GemmCoord(int(n * h * w), int(c), int(k * r * s)); + case library::ConvKind::kWgrad: return cutlass::gemm::GemmCoord(int(k), int(r * s * c), int(n * p * q)); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent 
for tensor A + std::vector extent_a(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(n), int(h), int(w), int(c)}; + case library::ConvKind::kDgrad: return {int(n), int(p), int(q), int(k)}; + case library::ConvKind::kWgrad: return {int(n), int(p), int(q), int(k)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor B + std::vector extent_b(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(k), int(r), int(s), int(c)}; + case library::ConvKind::kDgrad: return {int(k), int(r), int(s), int(c)}; + case library::ConvKind::kWgrad: return {int(n), int(h), int(w), int(c)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor C + std::vector extent_c(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(n), int(p), int(q), int(k)}; + case library::ConvKind::kDgrad: return {int(n), int(h), int(w), int(c)}; + case library::ConvKind::kWgrad: return {int(k), int(r), int(s), int(c)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix A + library::LayoutTypeID eq_gemm_layout_a(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return library::LayoutTypeID::kRowMajor; // TN Gemm + case library::ConvKind::kDgrad: return library::LayoutTypeID::kRowMajor; // TT Gemm + case library::ConvKind::kWgrad: return library::LayoutTypeID::kColumnMajor; // NT Gemm + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix B + library::LayoutTypeID eq_gemm_layout_b(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return library::LayoutTypeID::kColumnMajor; // TN Gemm + case library::ConvKind::kDgrad: return library::LayoutTypeID::kRowMajor; // TT Gemm + case library::ConvKind::kWgrad: return library::LayoutTypeID::kRowMajor; // NT Gemm + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix C + library::LayoutTypeID eq_gemm_layout_c(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + // Gemm operator assumes column-major output + case library::ConvKind::kFprop: + case library::ConvKind::kDgrad: + case library::ConvKind::kWgrad: return library::LayoutTypeID::kColumnMajor; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix A + int64_t eq_gemm_lda(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kDgrad: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).m(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix B + int64_t eq_gemm_ldb(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kDgrad: return eq_gemm_size(conv_kind).n(); + case 
library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).n(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix C + int64_t eq_gemm_ldc(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: + case library::ConvKind::kDgrad: + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).m(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + }; + + /// Workspace used + struct Conv2dWorkspace { + + /// Conv device allocations + DeviceAllocation *A; + DeviceAllocation *B; + DeviceAllocation *C; + DeviceAllocation *Computed; + DeviceAllocation *Reference; + + /// Library configuration and arguments for convolution operator + library::Conv2dConfiguration configuration; + library::ConvArguments arguments; + + /// Number of copies of the problem workspace which are visited sequentially during + /// profiling to avoid camping in the last level cache. + int problem_count; + + /// Buffer used for the cutlass conv2d operations' host workspace + std::vector host_workspace; + + /// Buffer used for the cutlass operations' device workspace + DeviceAllocation device_workspace; + + /// Library configuration and arguments for reduction operator + library::ReductionConfiguration reduction_configuration; + library::ReductionArguments reduction_arguments; + + /// Buffer used for the cutlass reduction operations' host workspace + std::vector reduction_host_workspace; + + /// Host data buffers for host reference operation + /// host buffer for tensor + std::vector host_tensor_a; + + /// host buffer for tensor b + std::vector host_tensor_b; + + /// host buffer for tensor c + std::vector host_tensor_c; + + + // + // Methods + // + + Conv2dWorkspace(): + A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { } + + // Returns stride vector for tensor A + std::vector stride_a(library::ConvKind const &conv_kind) { + return { + configuration.layout_a(conv_kind).stride()[0], + configuration.layout_a(conv_kind).stride()[1], + configuration.layout_a(conv_kind).stride()[2] + }; + } + + // Returns stride vector for tensor B + std::vector stride_b(library::ConvKind const &conv_kind) { + + return { + configuration.layout_b(conv_kind).stride()[0], + configuration.layout_b(conv_kind).stride()[1], + configuration.layout_b(conv_kind).stride()[2] + }; + } + + // Returns stride vector for tensor C + std::vector stride_c(library::ConvKind const &conv_kind) { + + return { + configuration.layout_c(conv_kind).stride()[0], + configuration.layout_c(conv_kind).stride()[1], + configuration.layout_c(conv_kind).stride()[2] + }; + } + }; + +protected: + + // + // Data members + // + + /// CONV problem obtained from problem space + Conv2dProblem problem_; + + /// Device memory allocations + Conv2dWorkspace conv_workspace_; + + /// CUTLASS parallel reduction operation to follow this* conv2d operation + library::Operation const *reduction_op_; + +public: + // + // Methods + // + + /// Ctor + Conv2dOperationProfiler(Options const &options); + + /// Destructor + virtual ~Conv2dOperationProfiler(); + + /// Prints usage statement for the math function + virtual void print_usage(std::ostream &out) const; + + /// Prints examples + virtual void print_examples(std::ostream &out) const; + + /// Extracts the problem dimensions + virtual Status initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext 
&device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes workspace + virtual Status initialize_workspace( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against references + virtual bool verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Measures performance results + virtual bool profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +protected: + /// Method to profile an initialized CUTLASS operation + virtual Status profile_cutlass_( + double &runtime, + Options const &options, + library::Operation const *operation, + void *arguments, + void *host_workspace, + void *device_workspace); + + + /// Initialize reduction problem dimenstions and library::Operation + bool initialize_reduction_configuration_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes the performance result + void initialize_result_( + PerformanceResult &result, + Options const &options, + library::ConvDescription const &operation_desc, + ProblemSpace const &problem_space); + + /// Verifies CUTLASS against host reference + bool verify_with_host_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against device reference + bool verify_with_device_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +#if CUTLASS_ENABLE_CUDNN + + /// Verifies CUTLASS against cudnn reference + bool verify_with_cudnn_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +#endif //#if CUTLASS_ENABLE_CUDNN + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/conv3d_operation_profiler.cu b/tools/profiler/src/conv3d_operation_profiler.cu new file mode 100644 index 0000000000..67f21d8f7a --- /dev/null +++ b/tools/profiler/src/conv3d_operation_profiler.cu @@ -0,0 +1,1345 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Convolution 3D profiling + +*/ + +#include +#include +#include +#include + +#include "cutlass/core_io.h" + +#include "conv3d_operation_profiler.h" +#include "gpu_timer.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// +using namespace cutlass::library; + +namespace cutlass { +namespace profiler { + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Ctor +Conv3dOperationProfiler::Conv3dOperationProfiler(Options const &options): + OperationProfiler( + options, + library::OperationKind::kConv3d, + { + {ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"}, + {ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"d", "input_d"}, "Input D dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"h", "input_h"}, "Input H dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"w", "input_w"}, "Input W dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"c", "input_c"}, "Input C dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"k", "filter_k"}, "Filter K dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"t", "filter_t"}, "Filter T dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"r", "filter_r"}, "Filter R dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"s", "filter_s"}, "Filter S dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"z", "output_z"}, "Output Z dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"p", "output_p"}, "Output P dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"q", "output_q"}, "Output Q dimension of the Conv3d problem space"}, + {ArgumentTypeID::kInteger, {"pad_d"}, "Padding in D 
direction"}, + {ArgumentTypeID::kInteger, {"pad_h"}, "Padding in H direction"}, + {ArgumentTypeID::kInteger, {"pad_w"}, "Padding in W direction"}, + {ArgumentTypeID::kInteger, {"stride_d"}, "Stride in D direction"}, + {ArgumentTypeID::kInteger, {"stride_h"}, "Stride in H direction"}, + {ArgumentTypeID::kInteger, {"stride_w"}, "Stride in W direction"}, + {ArgumentTypeID::kInteger, {"dilation_d"}, "Dilation in D direction"}, + {ArgumentTypeID::kInteger, {"dilation_h"}, "Dilation in H direction"}, + {ArgumentTypeID::kInteger, {"dilation_w"}, "Dilation in W direction"}, + {ArgumentTypeID::kTensor, {"Activation"}, "Tensor storing the Activation operand"}, + {ArgumentTypeID::kTensor, {"Filter"}, "Tensor storing the Filter operand"}, + {ArgumentTypeID::kTensor, {"Output"}, "Tensor storing the Output operand"}, + {ArgumentTypeID::kEnumerated, {"conv_mode"}, "Convolution filter mode (conv, cross)"}, + {ArgumentTypeID::kEnumerated, {"iterator_algorithm", "iterator_algo"}, "Convolution iterator algorithm (analytic, optimized)"}, + {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, + {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, + {ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "SplitK mode for serial or parallel reduction (serial, parallel)"}, + {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"}, + {ArgumentTypeID::kEnumerated, {"eq_gemm_provider", "eq-gemm-provider"}, "Enable profiling equivalent gemm by the following providers (cutlass)"}, + }, + { library::Provider::kReferenceDevice, library::Provider::kReferenceHost, library::Provider::kCUDNN } + ) { + + description_ = " Conv3d operation. Output(Tensor5D) = alpha * Input(Tensor5D) * Filter(Tensor5D) + beta * Input(Tensor5D)"; + +} + +/// Destructor +Conv3dOperationProfiler::~Conv3dOperationProfiler() { + +} + + +/// Prints usage statement for the math function +void Conv3dOperationProfiler::print_usage(std::ostream &out) const { + out << "Conv3d" << "\n\n"; + + OperationProfiler::print_usage(out); +} + +/// Prints examples +void Conv3dOperationProfiler::print_examples(std::ostream &out) const { + + out << "\nExamples:\n\n" + << "Profile a particular convolution (specify all the convolution parameters):\n" + << " $ cutlass_profiler --operation=Conv3d" + " --Activation=f16:ndhwc --Filter=f16:ndhwc --Output=f16 --accumulator-type=f32" + " --n=32 --d=16 --h=14 --w=14 --c=8 --k=64 --t=3 --r=3 --s=3" + " --pad_d=1 --pad_h=1 --pad_w=1" + " --stride_d=1 --stride::h=1 --stride::w=1" + " --dilation_d=1 --dilation::h=1 --dilation::w=1\n\n"; +} + +#if 0 +// used this for debugging +static std::string byte_string(std::vector const &bytes) { + std::stringstream ss; + + ss << "0x"; + + for (size_t idx = bytes.size(); idx > 0; --idx) { + ss << std::hex << std::setw(2) << std::setfill('0') << uint32_t(bytes.at(idx - 1)); + } + + return ss.str(); +} +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Total number of bytes loaded +int64_t Conv3dOperationProfiler::Conv3dProblem::bytes(library::ConvDescription const &operation_desc) const { + cutlass::gemm::GemmCoord mnk = eq_gemm_size(operation_desc.conv_kind); + + // Input bytes read and Output bytes written for the gemm problem + int64_t bytes_ = + int64_t(library::sizeof_bits(operation_desc.A.element) * mnk.m() / 8) * mnk.k() + + int64_t(library::sizeof_bits(operation_desc.B.element) * mnk.n() / 8) * mnk.k() + + 
int64_t(library::sizeof_bits(operation_desc.C.element) * mnk.m() / 8) * mnk.n(); + + // Set is_beta_zero true if beta is zero + bool is_beta_zero = std::all_of(beta.begin(), beta.end(), [](uint8_t i) { return i==0; }); + + // Output bytes read for the gemm problem for non-zero beta values + if (!is_beta_zero) { + bytes_ += int64_t(library::sizeof_bits(operation_desc.C.element) * mnk.m() / 8) * mnk.n(); + } + + return bytes_; +} + +/// Total number of flops computed +int64_t Conv3dOperationProfiler::Conv3dProblem::flops( + library::ConvDescription const &operation_desc) const { + + cutlass::gemm::GemmCoord mnk = eq_gemm_size(operation_desc.conv_kind); + + int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2; + int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2; + + // Adjust mainloop flop for dgrad strided + if (operation_desc.conv_kind == library::ConvKind::kDgrad) { + flops_mainloop_ = flops_mainloop_ / ( stride_d * stride_h * stride_w); + } + + return (flops_mainloop_ + flops_epilogue_); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Extracts the problem dimensions +Status Conv3dOperationProfiler::initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + library::ConvDescription const &operation_desc = + static_cast(operation->description()); + + if (!arg_as_int(problem_.n, "n", problem_space, problem)) { + // default value + problem_.n = 1; + } + + if (!arg_as_int(problem_.d, "d", problem_space, problem)) { + // default value + problem_.d = 8; + } + + if (!arg_as_int(problem_.h, "h", problem_space, problem)) { + // default value + problem_.h = 14; + } + + if (!arg_as_int(problem_.w, "w", problem_space, problem)) { + // default value + problem_.w = 14; + } + + if (!arg_as_int(problem_.c, "c", problem_space, problem)) { + // default value + problem_.c = 32; + } + + if (!arg_as_int(problem_.k, "k", problem_space, problem)) { + // default value + problem_.k = 32; + } + + if (!arg_as_int(problem_.t, "t", problem_space, problem)) { + // default value + problem_.t = 3; + } + + if (!arg_as_int(problem_.r, "r", problem_space, problem)) { + // default value + problem_.r = 3; + } + + if (!arg_as_int(problem_.s, "s", problem_space, problem)) { + // default value + problem_.s = 3; + } + + if (!arg_as_int(problem_.pad_d, "pad_d", problem_space, problem)) { + // default value + problem_.pad_d = 1; + } + + if (!arg_as_int(problem_.pad_w, "pad_w", problem_space, problem)) { + // default value + problem_.pad_w = 1; + } + if (!arg_as_int(problem_.pad_h, "pad_h", problem_space, problem)) { + // default value + problem_.pad_h = 1; + } + + if (!arg_as_int(problem_.stride_d, "stride_d", problem_space, problem)) { + // default value + problem_.stride_d = 1; + } + + if (!arg_as_int(problem_.stride_h, "stride_h", problem_space, problem)) { + // default value + problem_.stride_h = 1; + } + + if (!arg_as_int(problem_.stride_w, "stride_w", problem_space, problem)) { + // default value + problem_.stride_w = 1; + } + + if (!arg_as_int(problem_.dilation_d, "dilation_d", problem_space, problem)) { + // default value + problem_.dilation_d = 1; + } + + if (!arg_as_int(problem_.dilation_h, "dilation_h", problem_space, problem)) { + // default value + problem_.dilation_h = 1; + } + + if (!arg_as_int(problem_.dilation_w, "dilation_w", problem_space, 
problem)) { + // default value + problem_.dilation_w = 1; + } + + //////////////////////// Convolution output dimensions p and q //////////////////////// + // Cutlass convolutions support arbitrary output sizes and not constriant by // + // input, filter, padding, striding, dilation sizes. // + // cuDNN sets the output dimensions (p, q) using following equations: // + // // + // output = div_up(input + 2 * pad - ((filter - 1) * dilation + 1) + 1, stride) // + // where; div_up(a, b) : (a - 1)/b + 1 // + // // + // Thus, when output p and q dimensions are unspecified by the user // + // cutlass profiler sets p and q which are cuDNN compliant. // + // // + //////////////////////////////////////////////////////////////////////////////////////// + // set convolution output z + if (!arg_as_int(problem_.z, "z", problem_space, problem)) { + // default value (set using cudnn formula for output height, when p is not provided) + problem_.z = ( + problem_.d + + 2 * problem_.pad_d - + ((problem_.t - 1) * problem_.dilation_d + 1) + ) / (problem_.stride_d) + + 1; + } + + // set convolution output p + if (!arg_as_int(problem_.p, "p", problem_space, problem)) { + // default value (set using cudnn formula for output height, when p is not provided) + problem_.p = ( + problem_.h + + 2 * problem_.pad_h - + ((problem_.r - 1) * problem_.dilation_h + 1) + ) / (problem_.stride_h) + + 1; + } + + // set convolution output q + if (!arg_as_int(problem_.q, "q", problem_space, problem)) { + // default value (set using cudnn formula for output width, when q is not provided) + problem_.q = ( + problem_.w + + 2 * problem_.pad_w - + ((problem_.s - 1) * problem_.dilation_w + 1) + ) / (problem_.stride_w) + + 1; + } + ///////////////////////////////////////////////////////////////////////////////////////// + + + if (!arg_as_SplitKModeID(problem_.split_k_mode, "split_k_mode", problem_space, problem)) { + // default value + problem_.split_k_mode = library::SplitKMode::kSerial; + } + + if (!arg_as_int(problem_.split_k_slices, "split_k_slices", problem_space, problem)) { + // default value + problem_.split_k_slices = 1; + } + + if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) { + // default value + problem_.conv_mode = library::ConvModeID::kCrossCorrelation; + } + + if (!arg_as_ProviderID(problem_.eq_gemm_provider, "eq_gemm_provider", problem_space, problem)) { + // default value + problem_.eq_gemm_provider = library::Provider::kNone; + } + + if (!conv_kind_satisfies(operation_desc.conv_kind, "conv_kind", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!iterator_algorithm_satisfies(operation_desc.iterator_algorithm, "iterator_algorithm", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.activation(), "Activation", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.filter(), "Filter", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!tensor_description_satisfies(operation_desc.output(), "Output", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + + if (!arg_as_scalar( + problem_.alpha, + operation_desc.element_epilogue, + "alpha", + problem_space, + problem)) { + + if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) { + return Status::kErrorInternal; + } + } + + if (!arg_as_scalar( + problem_.beta, + operation_desc.element_epilogue, + "beta", + 
problem_space, + problem)) { + + if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) { + return Status::kErrorInternal; + } + } + + // initialize library::ConvConfiguration + conv_workspace_.configuration.problem_size = conv::Conv3dProblemSize( + int(problem_.n), + int(problem_.d), + int(problem_.h), + int(problem_.w), + int(problem_.c), + int(problem_.k), + int(problem_.t), + int(problem_.r), + int(problem_.s), + int(problem_.z), + int(problem_.p), + int(problem_.q), + int(problem_.pad_d), + int(problem_.pad_h), + int(problem_.pad_w), + int(problem_.stride_d), + int(problem_.stride_h), + int(problem_.stride_w), + int(problem_.dilation_d), + int(problem_.dilation_h), + int(problem_.dilation_w), + static_cast(static_cast(problem_.conv_mode)), + int(problem_.split_k_slices), + 1 // groups + ); + + conv_workspace_.configuration.split_k_mode = static_cast(static_cast(problem_.split_k_mode)); + + conv_workspace_.configuration.layout_activations.stride() = make_Coord( + int(problem_.c), + int(problem_.w) * int(problem_.c), + int(problem_.h) * int(problem_.w) * int(problem_.c), + int(problem_.d) * int(problem_.h) * int(problem_.w) * int(problem_.c) + ); + + conv_workspace_.configuration.layout_filters.stride() = make_Coord( + int(problem_.c), + int(problem_.s) * int(problem_.c), + int(problem_.r) * int(problem_.s) * int(problem_.c), + int(problem_.t) * int(problem_.r) * int(problem_.s) * int(problem_.c) + ); + + conv_workspace_.configuration.layout_output.stride() = make_Coord( + int(problem_.k), + int(problem_.q) * int(problem_.k), + int(problem_.q) * int(problem_.p) * int(problem_.k), + int(problem_.z) * int(problem_.q) * int(problem_.p) * int(problem_.k) + ); + + + // initialize library::ConvArguments + conv_workspace_.arguments.A = nullptr; + conv_workspace_.arguments.B = nullptr; + conv_workspace_.arguments.C = nullptr; + conv_workspace_.arguments.D = nullptr; + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // initialize reduction operation for parallel splitKMode not supported for conv3d + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if(!initialize_reduction_configuration_(options, report, device_context, operation, problem_space, problem)) { + return Status::kErrorInternal; + } + } + + initialize_result_(this->model_result_, options, operation_desc, problem_space); + + return operation->can_implement(&conv_workspace_.configuration, &conv_workspace_.arguments); +} + +/// Initializes the performance result +void Conv3dOperationProfiler::initialize_result_( + PerformanceResult &result, + Options const &options, + library::ConvDescription const &operation_desc, + ProblemSpace const &problem_space) { + + result.provider = library::Provider::kCUTLASS; + result.disposition = Disposition::kNotRun; + result.status = Status::kSuccess; + result.operation_name = operation_desc.name; + + result.arguments.resize(problem_space.rank()); + + set_argument(result, "Activation", problem_space, + std::string(library::to_string(operation_desc.activation().element)) + + ":" + library::to_string(operation_desc.activation().layout)); + + set_argument(result, "Filter", problem_space, + std::string(library::to_string(operation_desc.filter().element)) + + ":" + library::to_string(operation_desc.filter().layout)); + + set_argument(result, "Output", problem_space, + 
std::string(library::to_string(operation_desc.output().element)) +
+    ":" + library::to_string(operation_desc.output().layout));
+
+  set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind));
+
+  set_argument(result, "iterator_algorithm", problem_space, std::string(library::to_string(operation_desc.iterator_algorithm)));
+
+  set_argument(result, "n", problem_space, problem_.n);
+  set_argument(result, "d", problem_space, problem_.d);
+  set_argument(result, "h", problem_space, problem_.h);
+  set_argument(result, "w", problem_space, problem_.w);
+  set_argument(result, "c", problem_space, problem_.c);
+
+  set_argument(result, "k", problem_space, problem_.k);
+  set_argument(result, "t", problem_space, problem_.t);
+  set_argument(result, "r", problem_space, problem_.r);
+  set_argument(result, "s", problem_space, problem_.s);
+
+  set_argument(result, "z", problem_space, problem_.z);
+  set_argument(result, "p", problem_space, problem_.p);
+  set_argument(result, "q", problem_space, problem_.q);
+
+  set_argument(result, "pad_d", problem_space, problem_.pad_d);
+  set_argument(result, "pad_h", problem_space, problem_.pad_h);
+  set_argument(result, "pad_w", problem_space, problem_.pad_w);
+
+  set_argument(result, "stride_d", problem_space, problem_.stride_d);
+  set_argument(result, "stride_h", problem_space, problem_.stride_h);
+  set_argument(result, "stride_w", problem_space, problem_.stride_w);
+
+  set_argument(result, "dilation_d", problem_space, problem_.dilation_d);
+  set_argument(result, "dilation_h", problem_space, problem_.dilation_h);
+  set_argument(result, "dilation_w", problem_space, problem_.dilation_w);
+
+  set_argument(result, "split_k_mode", problem_space,
+    std::string(library::to_string(problem_.split_k_mode)));
+  set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);
+
+  set_argument(result, "conv_mode", problem_space,
+    std::string(library::to_string(problem_.conv_mode)));
+
+  set_argument(result, "alpha", problem_space,
+    library::lexical_cast(problem_.alpha, operation_desc.element_epilogue));
+
+  set_argument(result, "beta", problem_space,
+    library::lexical_cast(problem_.beta, operation_desc.element_epilogue));
+
+  set_argument(result, "eq_gemm_provider", problem_space,
+    std::string(library::to_string(problem_.eq_gemm_provider)));
+
+  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
+
+  // Bytes of activation, filter, and output tensors
+  result.bytes = problem_.bytes(operation_desc);
+
+  // Theoretical flops required for the computation
+  result.flops = problem_.flops(operation_desc);
+
+  // Measured runtime
+  result.runtime = 0;
+
+}
+
+/// Initialize reduction problem dimensions and library::Operation
+bool Conv3dOperationProfiler::initialize_reduction_configuration_(
+  Options const &options,
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  library::ConvDescription const &conv_desc =
+    static_cast<library::ConvDescription const &>(operation->description());
+
+  library::ConvKind const &conv_kind = conv_desc.conv_kind;
+
+  if (!cast_from_double(problem_.alpha_one, conv_desc.element_epilogue, 1)) {
+    return false;
+  }
+
+  if (!cast_from_double(problem_.beta_zero, conv_desc.element_epilogue, 0)) {
+    return false;
+  }
+
+  /// This chooses the appropriate stride element of the row-major C tensor.
+  int const & tensor_c_stride_idx = (conv_kind == library::ConvKind::kWgrad ? 3 : 0);
+
+  /// initialize library::ReductionConfiguration
+  conv_workspace_.reduction_configuration.problem_size = problem_.eq_gemm_size(conv_kind).mn();
+  conv_workspace_.reduction_configuration.partitions = int(problem_.split_k_slices);
+  conv_workspace_.reduction_configuration.partition_stride = problem_.eq_gemm_size(conv_kind).mn().product();
+  conv_workspace_.reduction_configuration.ldw = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.lds = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.ldd = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
+
+  // find reduction operation
+  library::ReductionFunctionalKey reduction_key(
+    library::Provider::kCUTLASS,
+    conv_desc.tile_description.math_instruction.element_accumulator, // element workspace
+    conv_desc.tile_description.math_instruction.element_accumulator, // element accumulator
+    conv_desc.C.element, // element output
+    conv_desc.element_epilogue // element compute
+  );
+
+#if 0 // debug print to check which reduction instance is selected
+  std::cout << reduction_key << "\n";
+#endif
+  auto reduction_it = Singleton::get().operation_table.reduction_operations.find(reduction_key);
+
+  if(reduction_it == Singleton::get().operation_table.reduction_operations.end()) {
+
+    return false;
+  }
+
+  // initialize reduction operation required for parallel split-k conv3d operator
+  reduction_op_ = reduction_it->second;
+
+  // reduction operation found and initialized
+  return true;
+}
+
+
+/// Initializes workspace
+Status Conv3dOperationProfiler::initialize_workspace(
+  Options const &options,
+  PerformanceReport &report,
+  DeviceContext &device_context,
+  library::Operation const *operation,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  // initialize conv3d underlying operation to handle parallel reduction
+  library::Operation const* underlying_operation = operation;
+
+  if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
+    if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) {
+      return Status::kErrorNotSupported;
+    }
+  }
+
+  library::ConvDescription const &operation_desc =
+    static_cast<library::ConvDescription const &>(underlying_operation->description());
+
+  // Compute the number of copies of the problem to avoid L2 camping.
+ if (!options.profiling.workspace_count) { + int64_t bytes = problem_.bytes(operation_desc); + if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) { + conv_workspace_.problem_count = + 1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes); + } + else { + conv_workspace_.problem_count = 1; + } + } + else { + conv_workspace_.problem_count = options.profiling.workspace_count; + } + + + if (options.execution_mode != ExecutionMode::kDryRun) { + + conv_workspace_.A = device_context.allocate_tensor( + options, + "A", + operation_desc.A.element, + operation_desc.A.layout, + problem_.extent_a(operation_desc.conv_kind), + conv_workspace_.stride_a(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.B = device_context.allocate_tensor( + options, + "B", + operation_desc.B.element, + operation_desc.B.layout, + problem_.extent_b(operation_desc.conv_kind), + conv_workspace_.stride_b(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.C = device_context.allocate_tensor( + options, + "C", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.Computed = device_context.allocate_tensor( + "D", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + conv_workspace_.Reference = device_context.allocate_tensor( + "Reference", + operation_desc.C.element, + operation_desc.C.layout, + problem_.extent_c(operation_desc.conv_kind), + conv_workspace_.stride_c(operation_desc.conv_kind), + conv_workspace_.problem_count + ); + + } + + // + // Initialize the CUTLASS operation + // + Status status = Status::kSuccess; + + if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + + if (options.execution_mode != ExecutionMode::kDryRun) { + + uint64_t workspace_size = underlying_operation->get_host_workspace_size(&conv_workspace_.configuration); + conv_workspace_.host_workspace.resize(workspace_size, 0); + + workspace_size = underlying_operation->get_device_workspace_size(&conv_workspace_.configuration); + conv_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size); + + status = underlying_operation->initialize( + &conv_workspace_.configuration, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data()); + + if (status != Status::kSuccess) { + return status; + } + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + workspace_size = reduction_op_->get_host_workspace_size(&conv_workspace_.reduction_configuration); + conv_workspace_.reduction_host_workspace.resize(workspace_size, 0); + + status = reduction_op_->initialize( + &conv_workspace_.reduction_configuration, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + + if (status != Status::kSuccess) { + return status; + } + } + } + + // + // If CUTLASS is enabled, generate a result for it + // + results_.push_back(model_result_); + results_.back().provider = library::Provider::kCUTLASS; + results_.back().op_kind = library::OperationKind::kConv3d; + results_.back().disposition = Disposition::kNotRun; + + for(auto provider : verification_providers_) { + results_.back().verification_map[provider] = Disposition::kNotRun; + } + } + + return status; +} + 
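+// A minimal worked example of the rotating-workspace sizing used in initialize_workspace()
+// above. The numbers below are hypothetical and chosen only for illustration; they are not
+// taken from any particular GPU or problem in this change.
+//
+//   int64_t l2_bytes      = 40ll * 1024 * 1024;    // hypothetical 40 MiB last-level cache
+//   int64_t problem_bytes = 16ll * 1024 * 1024;    // hypothetical bytes touched per problem
+//   int problem_count = (problem_bytes < 3 * l2_bytes)
+//       ? 1 + int((3 * l2_bytes) / problem_bytes)  // 1 + 7 = 8 rotating copies
+//       : 1;                                       // footprint already exceeds 3x L2; one copy suffices
+//
+// profile_cutlass_() then selects a different copy each iteration via
+// (iteration % conv_workspace_.problem_count), so repeated timing runs do not benefit from
+// data left in the last-level cache by the previous iteration.
+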
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Verifies CUTLASS against references +bool Conv3dOperationProfiler::verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + if (!options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + return true; + } + + if (options.execution_mode == ExecutionMode::kDryRun) { + return true; + } + + cudaError_t result; + + // Initialize structure containing Conv arguments + set_cutlass_operator_arguments_(); + + conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data()); + + // + // Run the CUTLASS operation + // + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + results_.back().disposition = Disposition::kFailed; + return false; + } + } + +#if 0 + std::cout << "profiling : " << std::endl + << "conv2d : " << operation->description().name << std::endl + << "underlying conv2d : " << underlying_operation->description().name << std::endl + << "reduction : " << reduction_op_->description().name << std::endl; +#endif + + // run cutlass conv2d operation + results_.back().status = underlying_operation->run( + &conv_workspace_.arguments, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data()); + + if (results_.back().status != Status::kSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + results_.back().status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + + if (results_.back().status != Status::kSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + } + + // Synchronize before running device reference + result = cudaDeviceSynchronize(); + if (result != cudaSuccess) { + results_.back().disposition = Disposition::kFailed; + return false; + } + + // CUTLASS op ran the but not yet verified against any verification provider + results_.back().disposition = Disposition::kNotVerified; + + // + // Run verification providers + // + + if (options.verification.enabled) { + +#if CUTLASS_ENABLE_CUDNN + // Run verification cudnn reference + if (options.verification.provider_enabled(library::Provider::kCUDNN)) { + + // Guard against unsupported cases + auto const & conv_desc = static_cast(operation->description()); + + Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration); + + // Initialize reference data to the source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + if (status == Status::kSuccess) { + // call cudnn verification if supported + verify_with_cudnn_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + else if (status == Status::kErrorInvalidProblem) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kInvalidProblem; + } + + else { + // set verification map for cudnn to not supported + 
results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kNotSupported; + } + } +#endif // #if CUTLASS_ENABLE_CUDNN + + // Run verification host reference + if (options.verification.provider_enabled(library::Provider::kReferenceHost)) { + + // Restore reference data back to initial source data + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + + verify_with_host_reference_( + options, + report, + device_context, + operation, + problem_space, + problem); + } + + // Update disposition to worst case verification outcome among all + // verification providers which are supported + bool is_any_verification_run_passed = false; + for(auto &m : results_.back().verification_map) { + if(m.second == Disposition::kFailed || m.second == Disposition::kIncorrect) { + results_.back().disposition = m.second; + return true; + } + if(!is_any_verification_run_passed && m.second == Disposition::kPassed) { + is_any_verification_run_passed = true; + } + } + + if(is_any_verification_run_passed) { + results_.back().disposition = Disposition::kPassed; + } + } + + // Return true means continue profiling + return true; +} + + +/// Verifies CUTLASS against host reference +bool Conv3dOperationProfiler::verify_with_host_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + Status status; + + // + // Find host reference operation using conv functional description key + // + library::OperationDescription const &desc = operation->description(); + + auto &conv_desc = static_cast(desc); + + library::ConvFunctionalKey conv_key( + library::Provider::kReferenceHost, + conv_desc.conv_kind, + conv_desc.A.element, + conv_desc.A.layout, + conv_desc.B.element, + conv_desc.B.layout, + conv_desc.C.element, + conv_desc.C.layout, + conv_desc.tile_description.math_instruction.element_accumulator, + conv_desc.element_epilogue); + +#if 0 // debug print to check which host refererence instance is selected + std::cout << conv_key << "\n"; +#endif + + auto operators_it = Singleton::get().operation_table.conv3d_operations.find(conv_key); + + if(operators_it == Singleton::get().operation_table.conv3d_operations.end()) { + + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun; + return true; + } + + // conv3d host reference minimum cc is 0 (CPU) and no iterator algorithm + library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone); + auto cc_it = operators_it->second.find(preference_key); + + if(cc_it == operators_it->second.end()) { + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun; + return true; + } + + // host refernce has only one instances in ConvOperationVectorMap + library::Operation const *reference_op = cc_it->second[0]; + + // + // Copy input tensors A, B, and C from device to host buffers + // + conv_workspace_.host_tensor_a.resize(conv_workspace_.A->bytes()); + conv_workspace_.host_tensor_b.resize(conv_workspace_.B->bytes()); + conv_workspace_.host_tensor_c.resize(conv_workspace_.C->bytes()); + conv_workspace_.A->copy_to_host(conv_workspace_.host_tensor_a.data()); + conv_workspace_.B->copy_to_host(conv_workspace_.host_tensor_b.data()); + conv_workspace_.C->copy_to_host(conv_workspace_.host_tensor_c.data()); + + // + // Initialize structure containing Conv3d arguments + // + conv_workspace_.arguments.A = 
conv_workspace_.host_tensor_a.data(); + conv_workspace_.arguments.B = conv_workspace_.host_tensor_b.data(); + conv_workspace_.arguments.C = conv_workspace_.host_tensor_c.data(); + conv_workspace_.arguments.D = conv_workspace_.host_tensor_c.data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // + // Intialize host reference operation + // + std::vector host_workspace_reference_op; + + uint64_t workspace_size = reference_op->get_host_workspace_size(&conv_workspace_.configuration); + host_workspace_reference_op.resize(workspace_size, 0); + + reference_op->initialize( + &conv_workspace_.configuration, + host_workspace_reference_op.data()); + + // + // Run host reference operation + // + status = reference_op->run( + &conv_workspace_.arguments, + host_workspace_reference_op.data()); + + // Handle errors + if (status != Status::kSuccess) { + results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotVerified; + return true; + } + + // + // Copy host reference output to device memory for equality check on device + // + conv_workspace_.Reference->copy_from_host(conv_workspace_.arguments.D); + + // + // Verify results + // + results_.back().verification_map[library::Provider::kReferenceHost] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference, + conv_workspace_.Computed->batch_stride() + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + static_cast(operation->description()), + library::Provider::kCUTLASS, + library::Provider::kReferenceHost); + } + + // Return true means continue profiling + return true; +} + + +/// Verifies CUTLASS against host reference +bool Conv3dOperationProfiler::verify_with_device_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + // TODO: verify cutlass conv3d against device reference + + // Return true means continue profiling + return true; +} + +/// Measures performance results +bool Conv3dOperationProfiler::profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + + if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + + set_cutlass_operator_arguments_(); + + results_.back().status = profile_cutlass_( + results_.back().runtime, + options, + operation, + &conv_workspace_.arguments, + conv_workspace_.host_workspace.data(), + conv_workspace_.device_workspace.data() + ); + } + return true; + +} + +/// Updates the arguments structure for the CUTLASS operator based on +/// the problem index. 
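+///
+/// For the parallel split-K mode, the convolution itself writes partial accumulations to the
+/// device workspace with (alpha, beta) = (1, 0); the trailing reduction then reads that
+/// workspace together with the original tensor C and applies the user-supplied (alpha, beta).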
+void Conv3dOperationProfiler::set_cutlass_operator_arguments_(int problem_idx) { + // Initialize structure containing Conv3d arguments + conv_workspace_.arguments.A = conv_workspace_.A->batch_data(problem_idx); + conv_workspace_.arguments.B = conv_workspace_.B->batch_data(problem_idx); + conv_workspace_.arguments.C = conv_workspace_.C->batch_data(problem_idx); + conv_workspace_.arguments.D = conv_workspace_.Computed->batch_data(problem_idx); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + // update library::ConvArguments for parallel split-k reduction + conv_workspace_.arguments.D = conv_workspace_.device_workspace.data(); + conv_workspace_.arguments.alpha = problem_.alpha_one.data(); + conv_workspace_.arguments.beta = problem_.beta_zero.data(); + + /// intialize library::ReductionArguments + conv_workspace_.reduction_arguments.workspace = conv_workspace_.device_workspace.data(); + conv_workspace_.reduction_arguments.source = conv_workspace_.C->batch_data(problem_idx); + conv_workspace_.reduction_arguments.destination = conv_workspace_.Computed->batch_data(problem_idx); + conv_workspace_.reduction_arguments.alpha = problem_.alpha.data(); + conv_workspace_.reduction_arguments.beta = problem_.beta.data(); + conv_workspace_.reduction_arguments.pointer_mode = library::ScalarPointerMode::kHost; + } +} + +/// Method to profile a CUTLASS Operation +Status Conv3dOperationProfiler::profile_cutlass_( + double &runtime, + Options const &options, + library::Operation const *operation, + void *arguments, + void *host_workspace, + void *device_workspace) { + + GpuTimer timer; + + // initialize conv2d underlying operation to handle parallel reduction + library::Operation const* underlying_operation = operation; + + if(conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + if (!(underlying_operation = library::find_conv_operation_for_parallel_reduction(operation))) { + return Status::kErrorNotSupported; + } + } + + // + // Optional sleep to limit power consumption and thermals + // + + sleep(options.profiling.sleep_duration); + + // + // Warmup loop + // + + Status status; + + for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) { + + // Setup rotating workspace + int workspace_idx = options.profiling.warmup_iterations + iteration; + int problem_idx = (workspace_idx % conv_workspace_.problem_count); + + set_cutlass_operator_arguments_(problem_idx); + + // Run underlying conv2d operation + status = underlying_operation->run( + arguments, + host_workspace, + device_workspace); + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + + status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + } + + if (status != Status::kSuccess) { + return status; + } + } + + // + // Initialize GPU timer + // + + timer.start(); + + // + // Profiling loop + // + + int Iterations = options.profiling.iterations; + + int iteration = 0; + for (; iteration < Iterations; ++iteration) { + + // Setup rotating workspace + int problem_idx = (iteration % conv_workspace_.problem_count); + + set_cutlass_operator_arguments_(problem_idx); + + // Run underlying conv2d operation + status = 
underlying_operation->run( + arguments, + host_workspace, + device_workspace); + + // Run parallel reduction kernel for parallel split_k_mode + if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) { + status = reduction_op_->run( + &conv_workspace_.reduction_arguments, + conv_workspace_.reduction_host_workspace.data(), + nullptr); + } + + if (status != Status::kSuccess) { + return status; + } + } + + // + // Wait for completion + // + + timer.stop_and_wait(); + + // + // Update performance result + // + + runtime = timer.duration(iteration); + + return status; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +#if CUTLASS_ENABLE_CUDNN + +/// Verifies CUTLASS against cudnn reference +bool Conv3dOperationProfiler::verify_with_cudnn_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + auto &conv_desc = static_cast(operation->description()); + + // + // Construct cudnn operators + // + + CudnnCreate handle; + cudnnStatus_t status = handle.get_cudnn_create_status(); + + if (status != CUDNN_STATUS_SUCCESS) { + + results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status); + return true; + } + + // + // Initialize state + // + + // Initialize structure containing Conv2d arguments + conv_workspace_.arguments.A = conv_workspace_.A->data(); + conv_workspace_.arguments.B = conv_workspace_.B->data(); + conv_workspace_.arguments.D = conv_workspace_.Reference->data(); + conv_workspace_.arguments.alpha = problem_.alpha.data(); + conv_workspace_.arguments.beta = problem_.beta.data(); + conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost; + + // cuDNN does not support four tensor arguments, so we copy the tensor C data into + // tensor D. + conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data()); + conv_workspace_.arguments.C = conv_workspace_.arguments.D; + + try { + + // + // Construct dispatcher to cudnn operator + // + + detail::cudnnConvDispatcher conv_op( + conv_desc, + conv_workspace_.configuration, + conv_workspace_.arguments, + handle + ); + + if (conv_op.status != Status::kSuccess) { + if (conv_op.status == Status::kErrorNotSupported) { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kNotSupported; + + } else { + results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kFailed; + } + return true; + } + + + status = conv_op(handle); + + // Handle errors + if (status != CUDNN_STATUS_SUCCESS) { + + results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status); + return true; + } + + // + // Verify results + // + + results_.back().verification_map[library::Provider::kCUDNN] = compare_tensors( + options, + *conv_workspace_.Computed, + *conv_workspace_.Reference + ); + + // Save workspace if incorrect + if (options.verification.save_workspace == SaveWorkspace::kIncorrect && + results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) { + + save_workspace( + device_context, + options, + conv_desc, + library::Provider::kCUTLASS, + library::Provider::kCUDNN); + } + } + catch (...) 
{
+    results_.back().verification_map[library::Provider::kCUDNN] = Disposition::kFailed;
+  }
+
+  // Return true means continue profiling
+  return true;
+
+}
+
+#endif // #if CUTLASS_ENABLE_CUDNN
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace profiler
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/conv3d_operation_profiler.h b/tools/profiler/src/conv3d_operation_profiler.h
new file mode 100644
index 0000000000..04c2a15e82
--- /dev/null
+++ b/tools/profiler/src/conv3d_operation_profiler.h
@@ -0,0 +1,441 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + **************************************************************************************************/ +/* \file + \brief Defines profiling functionality for convolution + +*/ + +#pragma once + +#include +#include +#include +#include +#include + +// CUTLASS Library includes +#include "cutlass/library/library.h" +#include "cutlass/library/util.h" +#include "cutlass/library/handle.h" +#include "cutlass/library/manifest.h" +#include "cutlass/library/singleton.h" + +// Profiler includes +#include "options.h" +#include "device_context.h" +#include "operation_profiler.h" +#include "performance_result.h" +#include "problem_space.h" +#include "reduction_operation_profiler.h" +#if CUTLASS_ENABLE_CUDNN +#include "cudnn_helpers.h" +#endif //#if CUTLASS_ENABLE_CUDNN +#include "debug.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Abstract base class for each math function +class Conv3dOperationProfiler : public OperationProfiler { +public: + + /// Problem structure obtained from problem space + struct Conv3dProblem { + + int64_t n, d, h, w, c, z, p, q, k, t, r, s; + int64_t pad_d, pad_h, pad_w; + int64_t stride_d, stride_h, stride_w; + int64_t dilation_d, dilation_h, dilation_w; + + std::vector alpha; + std::vector beta; + + library::SplitKMode split_k_mode; + int64_t split_k_slices; + + library::ConvModeID conv_mode; + + library::Provider eq_gemm_provider; + + // convolution with parallel interleaved reduction + // convolution epilogue (alpha, beta) = (1.0, 0.0) + // reduction epilogue (alpha, beta) = (Conv3dProblem::alpha, Conv3dProblem::beta) + std::vector alpha_one; + std::vector beta_zero; + + // + // Methods + // + + /// Total number of bytes loaded + int64_t bytes(library::ConvDescription const &operation_desc) const; + + /// Total number of flops computed + int64_t flops(library::ConvDescription const &operation_desc) const; + + /// Infers output size from theinput size, padding, stride, and dilation + void set_default_output_size() { + z = ((d + pad_d - t * dilation_d) / stride_d) + 1; + p = ((h + pad_h - r * dilation_h) / stride_h) + 1; + q = ((w + pad_w - s * dilation_w) / stride_w) + 1; + } + + // Returns equivalent gemm problem size for convolution + cutlass::gemm::GemmCoord eq_gemm_size(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return cutlass::gemm::GemmCoord(int(n * z * p * q), int(k), int(t * r * s * c)); + case library::ConvKind::kDgrad: return cutlass::gemm::GemmCoord(int(n * d * h * w), int(c), int(t * r * s * k)); + case library::ConvKind::kWgrad: return cutlass::gemm::GemmCoord(int(k), int(t * r * s * c), int(n * z * p * q)); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor A + std::vector extent_a(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(n), int(d), int(h), int(w), int(c)}; + case library::ConvKind::kDgrad: return {int(n), int(z), int(p), int(q), int(k)}; + case library::ConvKind::kWgrad: return {int(n), int(z), int(p), int(q), int(k)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor B + std::vector extent_b(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case 
library::ConvKind::kFprop: return {int(k), int(t), int(r), int(s), int(c)}; + case library::ConvKind::kDgrad: return {int(k), int(t), int(r), int(s), int(c)}; + case library::ConvKind::kWgrad: return {int(n), int(d), int(h), int(w), int(c)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns extent for tensor C + std::vector extent_c(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return {int(n), int(z), int(p), int(q), int(k)}; + case library::ConvKind::kDgrad: return {int(n), int(d), int(h), int(w), int(c)}; + case library::ConvKind::kWgrad: return {int(k), int(t), int(r), int(s), int(c)}; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix A + library::LayoutTypeID eq_gemm_layout_a(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return library::LayoutTypeID::kRowMajor; // TN Gemm + case library::ConvKind::kDgrad: return library::LayoutTypeID::kRowMajor; // TT Gemm + case library::ConvKind::kWgrad: return library::LayoutTypeID::kColumnMajor; // NT Gemm + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix B + library::LayoutTypeID eq_gemm_layout_b(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return library::LayoutTypeID::kColumnMajor; // TN Gemm + case library::ConvKind::kDgrad: return library::LayoutTypeID::kRowMajor; // TT Gemm + case library::ConvKind::kWgrad: return library::LayoutTypeID::kRowMajor; // NT Gemm + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns layout for equivalent gemm matrix C + library::LayoutTypeID eq_gemm_layout_c(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + // Gemm operator assumes column-major output + case library::ConvKind::kFprop: + case library::ConvKind::kDgrad: + case library::ConvKind::kWgrad: return library::LayoutTypeID::kColumnMajor; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix A + int64_t eq_gemm_lda(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kDgrad: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).m(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix B + int64_t eq_gemm_ldb(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: return eq_gemm_size(conv_kind).k(); + case library::ConvKind::kDgrad: return eq_gemm_size(conv_kind).n(); + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).n(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns leading dimenstion for equivalent gemm matrix C + int64_t eq_gemm_ldc(library::ConvKind const &conv_kind) const { + + switch (conv_kind) { + case library::ConvKind::kFprop: + case library::ConvKind::kDgrad: + case library::ConvKind::kWgrad: return eq_gemm_size(conv_kind).m(); + default : throw std::runtime_error("Invalid Conv Operator (fprop, 
dgrad, wgrad)"); + } + } + }; + + /// Workspace used + struct Conv2dWorkspace { + + /// Conv device allocations + DeviceAllocation *A; + DeviceAllocation *B; + DeviceAllocation *C; + DeviceAllocation *Computed; + DeviceAllocation *Reference; + + /// Library configuration and arguments for convolution operator + library::Conv3dConfiguration configuration; + library::ConvArguments arguments; + + /// Number of copies of the problem workspace which are visited sequentially during + /// profiling to avoid camping in the last level cache. + int problem_count; + + /// Buffer used for the cutlass conv2d operations' host workspace + std::vector host_workspace; + + /// Buffer used for the cutlass operations' device workspace + DeviceAllocation device_workspace; + + /// Library configuration and arguments for reduction operator + library::ReductionConfiguration reduction_configuration; + library::ReductionArguments reduction_arguments; + + /// Buffer used for the cutlass reduction operations' host workspace + std::vector reduction_host_workspace; + + /// Host data buffers for host reference operation + /// host buffer for tensor + std::vector host_tensor_a; + + /// host buffer for tensor b + std::vector host_tensor_b; + + /// host buffer for tensor c + std::vector host_tensor_c; + + + // + // Methods + // + + Conv2dWorkspace(): + A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { } + + // Returns stride vector for tensor A + std::vector stride_a(library::ConvKind const &conv_kind) { + return { + configuration.layout_a(conv_kind).stride()[0], + configuration.layout_a(conv_kind).stride()[1], + configuration.layout_a(conv_kind).stride()[2], + configuration.layout_a(conv_kind).stride()[3] + }; + } + + // Returns stride vector for tensor B + std::vector stride_b(library::ConvKind const &conv_kind) { + + return { + configuration.layout_b(conv_kind).stride()[0], + configuration.layout_b(conv_kind).stride()[1], + configuration.layout_b(conv_kind).stride()[2], + configuration.layout_b(conv_kind).stride()[3] + }; + } + + // Returns stride vector for tensor C + std::vector stride_c(library::ConvKind const &conv_kind) { + + return { + configuration.layout_c(conv_kind).stride()[0], + configuration.layout_c(conv_kind).stride()[1], + configuration.layout_c(conv_kind).stride()[2], + configuration.layout_c(conv_kind).stride()[3] + }; + } + }; + +protected: + + // + // Data members + // + + /// CONV problem obtained from problem space + Conv3dProblem problem_; + + /// Device memory allocations + Conv2dWorkspace conv_workspace_; + + /// CUTLASS parallel reduction operation to follow this* conv2d operation + library::Operation const *reduction_op_; + +public: + // + // Methods + // + + /// Ctor + Conv3dOperationProfiler(Options const &options); + + /// Destructor + virtual ~Conv3dOperationProfiler(); + + /// Prints usage statement for the math function + virtual void print_usage(std::ostream &out) const; + + /// Prints examples + virtual void print_examples(std::ostream &out) const; + + /// Extracts the problem dimensions + virtual Status initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes workspace + virtual Status initialize_workspace( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + 
ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against references + virtual bool verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Measures performance results + virtual bool profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +protected: + + /// Updates the arguments structure for the CUTLASS operator based on + /// the problem index. + void set_cutlass_operator_arguments_(int problem_idx = 0); + + /// Method to profile an initialized CUTLASS operation + virtual Status profile_cutlass_( + double &runtime, + Options const &options, + library::Operation const *operation, + void *arguments, + void *host_workspace, + void *device_workspace); + + /// Initialize reduction problem dimenstions and library::Operation + bool initialize_reduction_configuration_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes the performance result + void initialize_result_( + PerformanceResult &result, + Options const &options, + library::ConvDescription const &operation_desc, + ProblemSpace const &problem_space); + + /// Verifies CUTLASS against host reference + bool verify_with_host_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against device reference + bool verify_with_device_reference_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +#if CUTLASS_ENABLE_CUDNN + + /// Verifies CUTLASS against cudnn reference + bool verify_with_cudnn_( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +#endif //#if CUTLASS_ENABLE_CUDNN + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/profiler/src/cudnn_helpers.cpp b/tools/profiler/src/cudnn_helpers.cpp new file mode 100644 index 0000000000..86f18095bf --- /dev/null +++ b/tools/profiler/src/cudnn_helpers.cpp @@ -0,0 +1,485 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Helper functions for mapping CUTLASS concepts to cuDNN. +*/ +#if CUTLASS_ENABLE_CUDNN + +#include + +#include "cudnn_helpers.h" + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Converts a cuDNN status to cutlass::Status +Status get_cutlass_status(cudnnStatus_t cudnn_status) { + + if (cudnn_status == CUDNN_STATUS_SUCCESS) { + return Status::kSuccess; + } + else if (cudnn_status == CUDNN_STATUS_INVALID_VALUE) { + return Status::kErrorInvalidProblem; + } + if (cudnn_status == CUDNN_STATUS_NOT_SUPPORTED) { + return Status::kErrorNotSupported; + } + return Status::kErrorInternal; +} + +/// Converts a cuDNN status to cutlass::profiler::Disposition +Disposition get_cutlass_disposition(cudnnStatus_t cudnn_status) { + + if (cudnn_status == CUDNN_STATUS_INVALID_VALUE) { + return Disposition::kInvalidProblem; + } + else if (cudnn_status == CUDNN_STATUS_NOT_SUPPORTED) { + return Disposition::kNotSupported; + } + return Disposition::kFailed; +} + +/// Checks cudnnStatus_t converts to cutlas status and returns if Status::kSuccess o.w. 
throws exception +Status checkCudnnErr(cudnnStatus_t cudnn_status) { + Status cutlass_status = get_cutlass_status(cudnn_status); + if(cutlass_status != Status::kSuccess) { + throw std::runtime_error("checkCudnnErr failed"); + } + return cutlass_status; +} + +/// Maps a CUTLASS conv mode to a cuDNN cudnnConvolutionMode_t +bool get_cudnn_conv_mode(cudnnConvolutionMode_t &cudnn_conv_mode, conv::Mode conv_mode) { + switch (conv_mode) { + case conv::Mode::kCrossCorrelation: + cudnn_conv_mode = CUDNN_CROSS_CORRELATION; + return true; + case conv::Mode::kConvolution: + cudnn_conv_mode = CUDNN_CONVOLUTION; + return true; + default: break; + } + return false; +} + +/// Maps a CUTLASS tensor layout to a cuDNN cudnnTensorFormat_t +bool get_cudnn_layout(cudnnTensorFormat_t &cudnn_layout, library::LayoutTypeID layout) { + switch (layout) { + // cudnn uses the same enum for TensorNC*HW along nDim (ConvDescription::conv_dim) + case library::LayoutTypeID::kTensorNCHW: + case library::LayoutTypeID::kTensorNCDHW: + cudnn_layout = CUDNN_TENSOR_NCHW; + return true; + case library::LayoutTypeID::kTensorNHWC: + case library::LayoutTypeID::kTensorNDHWC: + cudnn_layout = CUDNN_TENSOR_NHWC; + return true; + default: break; + } + return false; +} + +/// Maps a CUTLASS numeric type to a cuDNN cudnnDataType_t +bool get_cudnn_datatype(cudnnDataType_t &cudnn_element_type, library::NumericTypeID element_type) { + switch (element_type) { + case library::NumericTypeID::kF16: + cudnn_element_type = CUDNN_DATA_HALF; + return true; + + case library::NumericTypeID::kF32: + cudnn_element_type = CUDNN_DATA_FLOAT; + return true; + + case library::NumericTypeID::kF64: + cudnn_element_type = CUDNN_DATA_DOUBLE; + return true; + + case library::NumericTypeID::kS2: + break; + + case library::NumericTypeID::kS4: + break; + + case library::NumericTypeID::kS8: + cudnn_element_type = CUDNN_DATA_INT8; + return true; + + case library::NumericTypeID::kS16: + break; + + case library::NumericTypeID::kS32: + cudnn_element_type = CUDNN_DATA_INT32; + return true; + + case library::NumericTypeID::kS64: + break; + + case library::NumericTypeID::kU2: + break; + + case library::NumericTypeID::kU4: + break; + + case library::NumericTypeID::kU8: + cudnn_element_type = CUDNN_DATA_UINT8; + return true; + + case library::NumericTypeID::kU16: + break; + + case library::NumericTypeID::kU32: + break; + + case library::NumericTypeID::kU64: + break; + + case library::NumericTypeID::kB1: + break; + + case library::NumericTypeID::kInvalid: + + default: + break; + } + + return false; +} + +/// Maps CUTLASS math OpcodeClassID and MathOperationID to cuDNN math_type +bool get_cudnn_mathtype(cudnnMathType_t &cudnn_math_type, library::ConvDescription const &conv_desc) { + + switch (conv_desc.tile_description.math_instruction.opcode_class) { + + case library::OpcodeClassID::kTensorOp: + { + cudnn_math_type = CUDNN_TENSOR_OP_MATH; + + library::MathOperationID math_op = conv_desc.tile_description.math_instruction.math_operation; + + // Allow conversion on input data type for fast math operations + if (math_op == library::MathOperationID::kMultiplyAddFastF16 || + math_op == library::MathOperationID::kMultiplyAddFastBF16) + { + cudnn_math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; + } + + return true; + } + case library::OpcodeClassID::kSimt: + return false; + } + + return false; +} + +/// Cudnn compute type seems to be hardcoded to float (To handle a possible cudnn issue) +float cast_cudnn_compute_type_to_float(library::NumericTypeID type, void const * src) { + + 
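+  // Reads the scalar stored at 'src' as the element type 'type' and converts it to float;
+  // unhandled element types throw. This mirrors how detail::cudnnConvDispatcher consumes the
+  // epilogue scalars below:
+  //
+  //   alpha = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.alpha);
+  //   beta  = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.beta);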
switch (type) { + case library::NumericTypeID::kF16: + { + return float(*(static_cast(src))); + } + case library::NumericTypeID::kF32: + { + return float(*(static_cast(src))); + } + case library::NumericTypeID::kS32: + { + return float(*(static_cast(src))); + } + default: + throw std::runtime_error("Data type handled in cast_compute_type_to_float"); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Returns a status if cuDNN can satisfy a particular Conv2d description +Status cudnn_satisfies( + library::ConvDescription const &desc, + library::Conv2dConfiguration const &configuration) { + + auto const &a_tensor = desc.A; + auto const &b_tensor = desc.B; + auto const &c_tensor = desc.C; + auto const &math_instruction = desc.tile_description.math_instruction; + + if(a_tensor.element != b_tensor.element) { + return Status::kErrorInvalidDataType; + } + + //////////////////////// Convolution output dimensions p and q /////////////////////// + // Cutlass convolutions support arbitrary output dimensions and not constriant by // + // input, filter, padding, striding, dilation sizes. // + // cuDNN sets the output dimensions (p, q) using following equations: // + // // + // output = div_up(input + 2 * pad - ((filter - 1) * dilation + 1) + 1, stride) // + // where; div_up(a, b) : (a - 1)/b + 1 // + // // + // Before launching cudnn verification or profiling check that output p and q // + // dimensions are cuDNN compliant. // + // // + // If user sets output p and q which do not follow above constraints, cutlass conv, // + // host reference, device reference can run. However, cudnn convolution returns // + // "Invalid problem" // + // // + /////////////////////////////////////////////////////////////////////////////////////// + + // check conv output dimension p for cudnn + int cudnn_output_p = + ( + ( + configuration.problem_size.H + + 2 * configuration.problem_size.pad_h - + ((configuration.problem_size.R - 1) * + configuration.problem_size.dilation_h + 1) + ) / + (configuration.problem_size.stride_h) + + 1 + ); + + if (cudnn_output_p != configuration.problem_size.P) { + return Status::kErrorInvalidProblem; + } + + // check conv output dimension q for cudnn + int cudnn_output_q = + ( + ( + configuration.problem_size.W + + 2 * configuration.problem_size.pad_w - + ((configuration.problem_size.S - 1) * + configuration.problem_size.dilation_w + 1) + ) / + (configuration.problem_size.stride_w) + + 1 + ); + + if (cudnn_output_q != configuration.problem_size.Q) { + return Status::kErrorInvalidProblem; + } + ////////////////////////////////////////////////////////////////////////////////////// + + // conv operator with input=FP16, accumulator=FP32, output=FP32 datatype + if (a_tensor.element == library::NumericTypeID::kF16 && + b_tensor.element == library::NumericTypeID::kF16 && + math_instruction.element_accumulator == library::NumericTypeID::kF32 && + c_tensor.element == library::NumericTypeID::kF32 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kBF16 || + b_tensor.element == library::NumericTypeID::kBF16 || + c_tensor.element == library::NumericTypeID::kBF16 + ) { + + return Status::kErrorNotSupported; + } + + // TF32 input not supported in cuDNN + if (a_tensor.element == library::NumericTypeID::kTF32 || + b_tensor.element == library::NumericTypeID::kTF32 || + c_tensor.element == library::NumericTypeID::kTF32 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == 
library::NumericTypeID::kS8 || + b_tensor.element == library::NumericTypeID::kS8 || + c_tensor.element == library::NumericTypeID::kS8 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kU8 || + b_tensor.element == library::NumericTypeID::kU8 || + c_tensor.element == library::NumericTypeID::kU8 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kS4 || + b_tensor.element == library::NumericTypeID::kS4 || + c_tensor.element == library::NumericTypeID::kS4 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kU4 || + b_tensor.element == library::NumericTypeID::kU4 || + c_tensor.element == library::NumericTypeID::kU4 + ) { + + return Status::kErrorNotSupported; + } + + return Status::kSuccess; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns a status if cuDNN can satisfy a particular Conv3d description +Status cudnn_satisfies( + library::ConvDescription const &desc, + library::Conv3dConfiguration const &configuration) { + + auto const &a_tensor = desc.A; + auto const &b_tensor = desc.B; + auto const &c_tensor = desc.C; + auto const &math_instruction = desc.tile_description.math_instruction; + + if(a_tensor.element != b_tensor.element) { + return Status::kErrorInvalidDataType; + } + + //////////////////////// Convolution output dimensions p and q /////////////////////// + // Cutlass convolutions support arbitrary output dimensions and not constriant by // + // input, filter, padding, striding, dilation sizes. // + // cuDNN sets the output dimensions (p, q) using following equations: // + // // + // output = div_up(input + 2 * pad - ((filter - 1) * dilation + 1) + 1, stride) // + // where; div_up(a, b) : (a - 1)/b + 1 // + // // + // Before launching cudnn verification or profiling check that output p and q // + // dimensions are cuDNN compliant. // + // // + // If user sets output p and q which do not follow above constraints, cutlass conv, // + // host reference, device reference can run. 
However, cudnn convolution returns // + // "Invalid problem" // + // // + /////////////////////////////////////////////////////////////////////////////////////// + + // check conv output dimension z for cudnn + int cudnn_output_z = + ( + ( + configuration.problem_size.D + + 2 * configuration.problem_size.pad_d - + ((configuration.problem_size.T - 1) * + configuration.problem_size.dilation_d + 1) + ) / + (configuration.problem_size.stride_d) + + 1 + ); + + if (cudnn_output_z != configuration.problem_size.Z) { + return Status::kErrorInvalidProblem; + } + + // check conv output dimension p for cudnn + int cudnn_output_p = + ( + ( + configuration.problem_size.H + + 2 * configuration.problem_size.pad_h - + ((configuration.problem_size.R - 1) * + configuration.problem_size.dilation_h + 1) + ) / + (configuration.problem_size.stride_h) + + 1 + ); + + if (cudnn_output_p != configuration.problem_size.P) { + return Status::kErrorInvalidProblem; + } + + // check conv output dimension q for cudnn + int cudnn_output_q = + ( + ( + configuration.problem_size.W + + 2 * configuration.problem_size.pad_w - + ((configuration.problem_size.S - 1) * + configuration.problem_size.dilation_w + 1) + ) / + (configuration.problem_size.stride_w) + + 1 + ); + + if (cudnn_output_q != configuration.problem_size.Q) { + return Status::kErrorInvalidProblem; + } + ////////////////////////////////////////////////////////////////////////////////////// + + // conv operator with input, accumulator, output datatype of (hss) are not supported + // in cuDNN + if (a_tensor.element == library::NumericTypeID::kF16 && + b_tensor.element == library::NumericTypeID::kF16 && + math_instruction.element_accumulator == library::NumericTypeID::kF32 && + c_tensor.element == library::NumericTypeID::kF32 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kBF16 || + b_tensor.element == library::NumericTypeID::kBF16 || + c_tensor.element == library::NumericTypeID::kBF16 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kTF32 || + b_tensor.element == library::NumericTypeID::kTF32 || + c_tensor.element == library::NumericTypeID::kTF32 + ) { + + return Status::kErrorNotSupported; + } + + if (a_tensor.element == library::NumericTypeID::kS8 || + b_tensor.element == library::NumericTypeID::kS8 || + c_tensor.element == library::NumericTypeID::kS8 + ) { + + return Status::kErrorNotSupported; + } + + // S4 not supported in cuDNN + if (a_tensor.element == library::NumericTypeID::kS4 || + b_tensor.element == library::NumericTypeID::kS4 || + c_tensor.element == library::NumericTypeID::kS4 + ) { + + return Status::kErrorNotSupported; + } + + return Status::kSuccess; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +#endif diff --git a/tools/profiler/src/cudnn_helpers.h b/tools/profiler/src/cudnn_helpers.h new file mode 100644 index 0000000000..58fe4e678f --- /dev/null +++ b/tools/profiler/src/cudnn_helpers.h @@ -0,0 +1,584 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Helper functions for mapping CUTLASS concepts to cuDNN. + +*/ + +#pragma once +#if CUTLASS_ENABLE_CUDNN +#include +#include +#include +#include "cutlass/cutlass.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/library/library.h" +#include "enumerated_types.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Converts a cuDNN status to cutlass::Status +Status get_cutlass_status(cudnnStatus_t cudnn_status); + +/// Converts a cuDNN status to cutlass::profiler::Disposition +Disposition get_cutlass_disposition(cudnnStatus_t cudnn_status); + +/// Checks cudnnStatus_t converts to cutlas status and returns if Status::kSuccess o.w. 
throws exception +Status checkCudnnErr(cudnnStatus_t cudnn_status); + +/// Maps a CUTLASS conv mode to a cuDNN conv mode enumeration +bool get_cudnn_conv_mode(cudnnConvolutionMode_t &cudnn_conv_mode, conv::Mode conv_mode); + +/// Maps a CUTLASS layout type to a cuDNN data type enumeration +bool get_cudnn_layout(cudnnTensorFormat_t &cudnn_layout, library::LayoutTypeID layout); + +/// Maps a CUTLASS numeric type to a cuDNN data type enumeration +bool get_cudnn_datatype(cudnnDataType_t &cudnn_element_type, library::NumericTypeID element_type); + +/// Maps CUTLASS math OpcodeClassID and MathOperationID to cuDNN math_type +bool get_cudnn_mathtype(cudnnMathType_t &cudnn_math_type, library::ConvDescription const &conv_desc); + +/// Returns a status if cudnn can satisfy a particular Conv2d description +Status cudnn_satisfies(library::ConvDescription const &desc, library::Conv2dConfiguration const &configuration); + +/// Returns a status if cudnn can satisfy a particular Conv3d description +Status cudnn_satisfies(library::ConvDescription const &desc, library::Conv3dConfiguration const &configuration); + +/// Cudnn compute type seems to be hardcoded to float (To handle a possible cudnn issue) +float cast_cudnn_compute_type_to_float(library::NumericTypeID type, void const * src); + + +/// This is a helper class to create cudnnHandle_t automatically on CudnnCreate object creation and +/// to destroy cudnnHandle_t on CudnnCreate object destruction. +/// Additionaly, it provides implicit cast from CudnnCreate's object to cudnnHandle_t's object +class CudnnCreate { +private: + cudnnHandle_t handle; + cudnnStatus_t status; + +public: + CudnnCreate() { + status = cudnnCreate(&handle); + } + + ~CudnnCreate() { + cudnnDestroy(handle); + } + + /// Implicit cast CudnnCreate object to cudnnHandle_t + operator cudnnHandle_t() const { return handle; } + + /// returns cudnnStatus_t for handle creation + cudnnStatus_t get_cudnn_create_status() { return status; } +}; + + +namespace detail { + +/// Dispatcher to cudnn convolution operators +struct cudnnConvDispatcher { + + // + // Data members + // + //library::Conv2dConfiguration configuration; + library::ConvArguments arguments; + library::ConvKind conv_kind; + + // cudnn-specific data structures to fill cudnn API call arguments + // cudnn activation, filter, and output descriptors + cudnnTensorDescriptor_t activation_desc; + cudnnFilterDescriptor_t filter_desc; + cudnnTensorDescriptor_t output_desc; + cudnnConvolutionDescriptor_t conv_desc; + + // cudnn datatypes + cudnnDataType_t data_type_activation; + cudnnDataType_t data_type_filter; + cudnnDataType_t data_type_output; + + // cudnn layouts + cudnnTensorFormat_t layout_activation; + cudnnTensorFormat_t layout_filter; + cudnnTensorFormat_t layout_output; + + // cudnn convolution mode + cudnnConvolutionMode_t conv_mode; + + // cudnn math type (tensorop, tensorop with conversion, simt) + cudnnMathType_t math_type; + + // cudnn compute data type + cudnnDataType_t compute_type; + + // cudnn compute type seems to be hardcoded to float (to handle a possible a cudnn issue) + float alpha; + float beta; + + // cudnn workspace + size_t workspace_size_in_bytes = 0; + cutlass::device_memory::allocation workspace; + + // select cudnn's implicit gemm precomputed algorithm with tensor operations + static cudnnConvolutionFwdAlgo_t const fprop_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + static cudnnConvolutionBwdDataAlgo_t const dgrad_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + static 
cudnnConvolutionBwdFilterAlgo_t const wgrad_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + + Status status; + + // + // Methods + // + + // TODO: unify ctor cudnnConvDispatcher for conv2d and conv3d by unifying Conv2dConfigration + + // ctor for conv2d + cudnnConvDispatcher( + library::ConvDescription const &op_desc, + library::Conv2dConfiguration configuration, + library::ConvArguments arguments_, + cudnnHandle_t handle + ): + //configuration(configuration_), + arguments(arguments_), + conv_kind(op_desc.conv_kind), + status(Status::kSuccess) { + + bool good = true; + + // Get cudnn datatype, layout, and convolution mode from library::ConvDescription + good = (good && get_cudnn_datatype(data_type_activation, op_desc.A.element)); + good = (good && get_cudnn_datatype(data_type_filter, op_desc.B.element)); + good = (good && get_cudnn_datatype(data_type_output, op_desc.C.element)); + good = (good && get_cudnn_layout(layout_activation, op_desc.A.layout)); + good = (good && get_cudnn_layout(layout_filter, op_desc.B.layout)); + good = (good && get_cudnn_layout(layout_output, op_desc.C.layout)); + good = (good && get_cudnn_conv_mode(conv_mode, configuration.problem_size.mode)); + // Get cudnn mathtype (cudnnMathType_t) + good = (good && get_cudnn_mathtype(math_type, op_desc)); + good = (good && get_cudnn_datatype( + compute_type, + op_desc.tile_description.math_instruction.element_accumulator)); + // Check cutlass Conv2d description has equivalent operator in cudnn + if (!good) { + status = Status::kErrorNotSupported; + return; + } + // cudnn compute type seems to be hardcoded to float (to handle a possible a cudnn issue) + alpha = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.alpha); + beta = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.beta); + + // Create convolution descriptor object + status = get_cutlass_status(cudnnCreateConvolutionDescriptor(&conv_desc)); + + // Configure convolution operator + std::vector padding {configuration.problem_size.pad_h, configuration.problem_size.pad_w}; + std::vector stride {configuration.problem_size.stride_h, configuration.problem_size.stride_w}; + std::vector dilation {configuration.problem_size.dilation_h, configuration.problem_size.dilation_w}; + + status = get_cutlass_status( + cudnnSetConvolutionNdDescriptor( + conv_desc, + op_desc.conv_dim, + padding.data(), + stride.data(), + dilation.data(), + conv_mode, + compute_type + )); + + // Set groups + status = get_cutlass_status(cudnnSetConvolutionGroupCount(conv_desc, configuration.problem_size.groups)); + + // Create activation, filter, and output descriptor objects + status = get_cutlass_status(cudnnCreateTensorDescriptor(&activation_desc)); + status = get_cutlass_status(cudnnCreateFilterDescriptor(&filter_desc)); + status = get_cutlass_status(cudnnCreateTensorDescriptor(&output_desc)); + + // Set activation, filter, and output descriptor + status = get_cutlass_status( + cudnnSetTensor4dDescriptor( + activation_desc, + layout_activation, + data_type_activation, + configuration.problem_size.N, + configuration.problem_size.C, + configuration.problem_size.H, + configuration.problem_size.W + )); + + status = get_cutlass_status( + cudnnSetFilter4dDescriptor( + filter_desc, + data_type_filter, + layout_filter, + configuration.problem_size.K, + configuration.problem_size.C, + configuration.problem_size.R, + configuration.problem_size.S + )); + + status = get_cutlass_status( + cudnnSetTensor4dDescriptor( + output_desc, + layout_output, + data_type_output, + 
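+        // output tensor extents in the API's (n, c, h, w) slots: N, K, P, Q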
configuration.problem_size.N, + configuration.problem_size.K, + configuration.problem_size.P, + configuration.problem_size.Q + )); + + // Set math instruction to tensor op + status = get_cutlass_status( + cudnnSetConvolutionMathType(conv_desc, math_type)); + + // Initialize workspace + switch (conv_kind) { + case library::ConvKind::kFprop: + status = get_cutlass_status( + cudnnGetConvolutionForwardWorkspaceSize( + handle, + activation_desc, + filter_desc, + conv_desc, + output_desc, + fprop_algo, + &workspace_size_in_bytes + )); break; + case library::ConvKind::kDgrad: + status = get_cutlass_status( + cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, + filter_desc, + output_desc, + conv_desc, + activation_desc, + dgrad_algo, + &workspace_size_in_bytes + )); break; + case library::ConvKind::kWgrad: + status = get_cutlass_status( + cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, + activation_desc, + output_desc, + conv_desc, + filter_desc, + wgrad_algo, + &workspace_size_in_bytes + )); break; + + } + + workspace = cutlass::device_memory::allocation(workspace_size_in_bytes); + } + + + // ctor for conv3d + cudnnConvDispatcher( + library::ConvDescription const &op_desc, + library::Conv3dConfiguration configuration, + library::ConvArguments arguments_, + cudnnHandle_t handle + ): + //configuration(configuration_), + arguments(arguments_), + conv_kind(op_desc.conv_kind), + status(Status::kSuccess) { + + bool good = true; + + // Get cudnn datatype, layout, and convolution mode from library::ConvDescription + good = (good && get_cudnn_datatype(data_type_activation, op_desc.A.element)); + good = (good && get_cudnn_datatype(data_type_filter, op_desc.B.element)); + good = (good && get_cudnn_datatype(data_type_output, op_desc.C.element)); + + good = (good && get_cudnn_layout(layout_activation, op_desc.A.layout)); + good = (good && get_cudnn_layout(layout_filter, op_desc.B.layout)); + good = (good && get_cudnn_layout(layout_output, op_desc.C.layout)); + + good = (good && get_cudnn_conv_mode(conv_mode, configuration.problem_size.mode)); + + // cudnn compute type seems to be hardcoded to float (to handle a possible a cudnn issue) + alpha = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.alpha); + beta = cast_cudnn_compute_type_to_float(op_desc.element_epilogue, arguments.beta); + + good = (good && get_cudnn_datatype( + compute_type, + op_desc.tile_description.math_instruction.element_accumulator)); + + // Check cutlass Conv2d description has equivalent operator in cudnn + if (!good) { + status = Status::kErrorNotSupported; + } + + // Create convolution descriptor object + status = get_cutlass_status(cudnnCreateConvolutionDescriptor(&conv_desc)); + + // Configure convolution operator + std::vector padding {configuration.problem_size.pad_d, configuration.problem_size.pad_h, configuration.problem_size.pad_w}; + std::vector stride {configuration.problem_size.stride_d, configuration.problem_size.stride_h, configuration.problem_size.stride_w}; + std::vector dilation {configuration.problem_size.dilation_d, configuration.problem_size.dilation_h, configuration.problem_size.dilation_w}; + + status = get_cutlass_status( + cudnnSetConvolutionNdDescriptor( + conv_desc, + op_desc.conv_dim, + padding.data(), + stride.data(), + dilation.data(), + conv_mode, + compute_type + )); + + // Set groups + status = get_cutlass_status(cudnnSetConvolutionGroupCount(conv_desc, configuration.problem_size.groups)); + + // Create activation, filter, and output descriptor objects + status = 
get_cutlass_status(cudnnCreateTensorDescriptor(&activation_desc)); + status = get_cutlass_status(cudnnCreateFilterDescriptor(&filter_desc)); + status = get_cutlass_status(cudnnCreateTensorDescriptor(&output_desc)); + + // Set activation descriptor + std::vector activation_extent { + configuration.problem_size.N, + configuration.problem_size.C, + configuration.problem_size.D, + configuration.problem_size.H, + configuration.problem_size.W + }; + + std::vector activation_stride { + configuration.layout_activations.stride()[3], + 1, + configuration.layout_activations.stride()[2], + configuration.layout_activations.stride()[1], + configuration.layout_activations.stride()[0] + }; + + status = get_cutlass_status( + cudnnSetTensorNdDescriptor( + activation_desc, + data_type_activation, + op_desc.conv_dim + 2, + activation_extent.data(), + activation_stride.data() + )); + + // Set filter descriptor + std::vector filter_extent { + configuration.problem_size.K, + configuration.problem_size.C, + configuration.problem_size.T, + configuration.problem_size.R, + configuration.problem_size.S + }; + + std::vector filter_stride { + configuration.layout_filters.stride()[3], + 1, + configuration.layout_filters.stride()[2], + configuration.layout_filters.stride()[1], + configuration.layout_filters.stride()[0] + }; + + status = get_cutlass_status( + cudnnSetFilterNdDescriptor( + filter_desc, + data_type_filter, + layout_filter, + op_desc.conv_dim + 2, + filter_extent.data() + )); + + + // Set output descriptor + std::vector output_extent { + configuration.problem_size.N, + configuration.problem_size.K, + configuration.problem_size.Z, + configuration.problem_size.P, + configuration.problem_size.Q + }; + + std::vector output_stride { + configuration.layout_output.stride()[3], + 1, + configuration.layout_output.stride()[2], + configuration.layout_output.stride()[1], + configuration.layout_output.stride()[0] + }; + + status = get_cutlass_status( + cudnnSetTensorNdDescriptor( + output_desc, + data_type_output, + op_desc.conv_dim + 2, + output_extent.data(), + output_stride.data() + )); + + // Set math instruction to tensor op + status = get_cutlass_status( + cudnnSetConvolutionMathType(conv_desc, math_type)); + + // Initialize workspace + switch (conv_kind) { + case library::ConvKind::kFprop: + status = get_cutlass_status( + cudnnGetConvolutionForwardWorkspaceSize( + handle, + activation_desc, + filter_desc, + conv_desc, + output_desc, + fprop_algo, + &workspace_size_in_bytes + )); break; + case library::ConvKind::kDgrad: + status = get_cutlass_status( + cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, + filter_desc, + output_desc, + conv_desc, + activation_desc, + dgrad_algo, + &workspace_size_in_bytes + )); break; + case library::ConvKind::kWgrad: + status = get_cutlass_status( + cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, + activation_desc, + output_desc, + conv_desc, + filter_desc, + wgrad_algo, + &workspace_size_in_bytes + )); break; + + } + + workspace = cutlass::device_memory::allocation(workspace_size_in_bytes); + } + + /// Executes Conv2d operater from cudnn library + cudnnStatus_t operator()(cudnnHandle_t handle) { + + switch (conv_kind) { + case library::ConvKind::kFprop: + return cudnnConvolutionForward( + handle, + &alpha, + activation_desc, + activation(), + filter_desc, + filter(), + conv_desc, + fprop_algo, + workspace.get(), + workspace_size_in_bytes, + &beta, + output_desc, + arguments.D + ); + case library::ConvKind::kDgrad: + return cudnnConvolutionBackwardData( + handle, + 
&alpha, + filter_desc, + filter(), + output_desc, + output(), + conv_desc, + dgrad_algo, + workspace.get(), + workspace_size_in_bytes, + &beta, + activation_desc, + arguments.D + ); + case library::ConvKind::kWgrad: + return cudnnConvolutionBackwardFilter( + handle, + &alpha, + activation_desc, + activation(), + output_desc, + output(), + conv_desc, + wgrad_algo, + workspace.get(), + workspace_size_in_bytes, + &beta, + filter_desc, + arguments.D + ); + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Actviation Tensor + void const * activation() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return arguments.A; + case library::ConvKind::kDgrad : return arguments.C; + case library::ConvKind::kWgrad : return arguments.B; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Filter Tensor + void const *filter() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return arguments.B; + case library::ConvKind::kDgrad : return arguments.B; + case library::ConvKind::kWgrad : return arguments.C; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } + + // Returns Output Tensor + void const *output() const { + switch(conv_kind) { + case library::ConvKind::kFprop : return arguments.C; + case library::ConvKind::kDgrad : return arguments.A; + case library::ConvKind::kWgrad : return arguments.A; + default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); + } + } +}; + +} // namespace detail +///////////////////////////////////////////////////////////////////////////////////////////////// +#endif //#if CUTLASS_ENABLE_CUDNN +} // namespace profiler +} // namespace cutlass diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu index 9934ff4cd6..c1e33ad61e 100644 --- a/tools/profiler/src/cutlass_profiler.cu +++ b/tools/profiler/src/cutlass_profiler.cu @@ -32,6 +32,8 @@ // Profiler includes #include "cutlass_profiler.h" #include "gemm_operation_profiler.h" +#include "conv2d_operation_profiler.h" +#include "conv3d_operation_profiler.h" #include "sparse_gemm_operation_profiler.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -50,6 +52,10 @@ CutlassProfiler::CutlassProfiler( operation_profilers_.emplace_back(new SparseGemmOperationProfiler(options)); + operation_profilers_.emplace_back(new Conv2dOperationProfiler(options)); + + operation_profilers_.emplace_back(new Conv3dOperationProfiler(options)); + } CutlassProfiler::~CutlassProfiler() { @@ -159,6 +165,8 @@ void CutlassProfiler::print_usage_(std::ostream &out) { out << "\n\nFor details about a particular function, specify the function name with --help.\n\nExample:\n\n" << " $ cutlass_profiler --operation=Gemm --help\n\n" + << " $ cutlass_profiler --operation=Conv3d --help\n\n" + << " $ cutlass_profiler --operation=Conv2d --help\n\n" ; } diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu index 777fb4d0aa..247bcccf15 100644 --- a/tools/profiler/src/device_allocation.cu +++ b/tools/profiler/src/device_allocation.cu @@ -133,7 +133,18 @@ std::vector DeviceAllocation::get_packed_layout( case library::LayoutTypeID::kTensorNDHWC: stride = get_packed_layout_stride(extent); break; - + case library::LayoutTypeID::kTensorNC32HW32: + stride = get_packed_layout_stride>(extent); + break; + case library::LayoutTypeID::kTensorNC64HW64: + 
stride = get_packed_layout_stride>(extent); + break; + case library::LayoutTypeID::kTensorC32RSK32: + stride = get_packed_layout_stride>(extent); + break; + case library::LayoutTypeID::kTensorC64RSK64: + stride = get_packed_layout_stride>(extent); + break; default: break; } @@ -247,6 +258,18 @@ size_t DeviceAllocation::construct_layout( case library::LayoutTypeID::kTensorNDHWC: return construct_layout_(bytes, layout_id, extent, stride); + case library::LayoutTypeID::kTensorNC32HW32: + return construct_layout_>(bytes, layout_id, extent, stride); + + case library::LayoutTypeID::kTensorNC64HW64: + return construct_layout_>(bytes, layout_id, extent, stride); + + case library::LayoutTypeID::kTensorC32RSK32: + return construct_layout_>(bytes, layout_id, extent, stride); + + case library::LayoutTypeID::kTensorC64RSK64: + return construct_layout_>(bytes, layout_id, extent, stride); + default: break; } @@ -1362,6 +1385,18 @@ static void write_tensor_csv_static_type( case library::LayoutTypeID::kTensorNDHWC: write_tensor_csv_static_tensor_view(out, allocation); break; + case library::LayoutTypeID::kTensorNC32HW32: + write_tensor_csv_static_tensor_view>(out, allocation); + break; + case library::LayoutTypeID::kTensorNC64HW64: + write_tensor_csv_static_tensor_view>(out, allocation); + break; + case library::LayoutTypeID::kTensorC32RSK32: + write_tensor_csv_static_tensor_view>(out, allocation); + break; + case library::LayoutTypeID::kTensorC64RSK64: + write_tensor_csv_static_tensor_view>(out, allocation); + break; default: throw std::runtime_error("Unhandled layout"); } diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu index 2bbf2eeb11..edd6f07ce2 100644 --- a/tools/profiler/src/operation_profiler.cu +++ b/tools/profiler/src/operation_profiler.cu @@ -243,7 +243,7 @@ int OperationProfiler::profile_all( ProblemSpace::Iterator problem_it = problem_space.begin(); ProblemSpace::Iterator problem_end = problem_space.end(); - bool continue_profiling = true; + bool continue_profiling = true, internal_error = false; // For each problem in problem space for (; continue_profiling && problem_it != problem_end; ++problem_it) { @@ -302,7 +302,8 @@ int OperationProfiler::profile_all( if (status == Status::kErrorInternal) { // Stop profiling if there was an internal error - return false; + internal_error = true; + break; } else if (status != Status::kSuccess) { // If the workspace could not be initialized for any other reason, continue to @@ -322,7 +323,8 @@ int OperationProfiler::profile_all( if (status == Status::kErrorInternal) { // Stop profiling if there was an internal error - return false; + internal_error = true; + break; } else if (status != Status::kSuccess) { // If the workspace could not be initialized for any other reason, continue to @@ -336,8 +338,9 @@ int OperationProfiler::profile_all( // // B. Verify CUTLASS - if (continue_profiling) { - + + if (continue_profiling && options.profiling.provider_enabled(library::Provider::kCUTLASS)) { + continue_profiling = this->verify_cutlass( options, report, @@ -368,6 +371,7 @@ int OperationProfiler::profile_all( // // D. Profile // + if (continue_profiling && options.profiling.enabled) { continue_profiling = this->profile( @@ -392,10 +396,7 @@ int OperationProfiler::profile_all( } } - // 3. Emit report - report.close(); - - return 0; + return internal_error ? 
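The profile_all() change above stops returning from the middle of the loop on an internal error; it records the failure, breaks out to the common cleanup path, and converts the flag into the process exit code. A simplified sketch of that control flow, where the Status values and loop body are stand-ins rather than the profiler's own types:

#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical status codes standing in for cutlass::Status inside the profiler.
enum class Status { kSuccess, kErrorNotSupported, kErrorInternal };

// Internal errors stop the sweep and surface through the return code;
// unsupported problems are merely skipped.
int profile_all(std::vector<Status> const &workspace_results) {
  bool continue_profiling = true, internal_error = false;

  for (std::size_t i = 0; continue_profiling && i < workspace_results.size(); ++i) {
    Status status = workspace_results[i];

    if (status == Status::kErrorInternal) {
      internal_error = true;   // remember the failure...
      break;                   // ...but fall through to the common cleanup/reporting path
    }
    if (status != Status::kSuccess) {
      continue;                // not supported for this problem; try the next one
    }
    std::printf("profiled problem %zu\n", i);
  }

  return internal_error ? 1 : 0;   // non-zero exit signals the internal error
}

int main() {
  std::vector<Status> ok{Status::kSuccess, Status::kErrorNotSupported, Status::kSuccess};
  std::vector<Status> bad{Status::kSuccess, Status::kErrorInternal, Status::kSuccess};
  std::printf("exit codes: %d %d\n", profile_all(ok), profile_all(bad));
  return 0;
}

Leaving through a single exit point keeps report flushing and destructors on the normal path even when profiling aborts early.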
1 : 0; } /////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu index e2d3e131f0..6bac578072 100644 --- a/tools/profiler/src/options.cu +++ b/tools/profiler/src/options.cu @@ -401,6 +401,7 @@ Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) { else { providers.push_back(library::Provider::kCUTLASS); providers.push_back(library::Provider::kCUBLAS); + providers.push_back(library::Provider::kCUDNN); } } @@ -428,8 +429,8 @@ void Options::Profiling::print_usage(std::ostream &out) const { << " --providers= " << " List of providers to be profiled for performance. (default: '*')" << end_of_line - << " Gemm providers {cutlass*" - << "}" << end_of_line + << " Gemm providers {cutlass*, cublas*}" << end_of_line + << " Conv2d providers {cutlass*, cudnn*}" << "\n\n"; } @@ -502,6 +503,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) { else { providers.push_back(library::Provider::kCUBLAS); providers.push_back(library::Provider::kReferenceDevice); + providers.push_back(library::Provider::kCUDNN); } } @@ -529,6 +531,7 @@ void Options::Verification::print_usage(std::ostream &out) const { << " --verification-providers= " << " List of providers used to verify result. (default: '*')" << end_of_line << " Gemm verification-providers {cublas*}" << end_of_line + << " Conv2d verification-providers {cudnn*, device*, host}" << "\n\n"; } @@ -570,6 +573,7 @@ Options::Report::Report(cutlass::CommandLine const &cmdline) { cmdline.get_cmd_line_argument("append", append, false); cmdline.get_cmd_line_argument("output", output_path); + cmdline.get_cmd_line_argument("junit-output", junit_output_path); if (cmdline.check_cmd_line_flag("tags")) { cmdline.get_cmd_line_argument_pairs("tags", pivot_tags); @@ -591,6 +595,9 @@ void Options::Report::print_usage(std::ostream &out) const { << " --output= " << " Path to output file for machine readable results. Operation kind and '.csv' is appended.\n\n" + << " --junit-output= " + << " Path to junit output file for result reporting. 
Operation kind and '.junit.xml' is appended.\n\n" + << " --report-not-run= " << " If true, reports the status of all kernels including those that" << end_of_line << " do not satisfy the given arguments.\n\n" @@ -608,6 +615,7 @@ void Options::Report::print_options(std::ostream &out, int indent) const { out << indent_str(indent) << "append: " << append << "\n" << indent_str(indent) << "output: " << output_path << "\n" + << indent_str(indent) << "junit-output: " << junit_output_path << "\n" << indent_str(indent) << "report_not_run: " << report_not_run << "\n" << indent_str(indent) << "tags:\n"; diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h index 48463efa50..79e0169970 100644 --- a/tools/profiler/src/options.h +++ b/tools/profiler/src/options.h @@ -218,6 +218,9 @@ class Options { /// Path to a file containing results std::string output_path; + /// Path to a file containing junit xml results + std::string junit_output_path; + /// Sequence of tags to attach to each result std::vector> pivot_tags; diff --git a/tools/profiler/src/performance_report.cpp b/tools/profiler/src/performance_report.cpp index 07a7edc955..de184eb04b 100644 --- a/tools/profiler/src/performance_report.cpp +++ b/tools/profiler/src/performance_report.cpp @@ -69,11 +69,15 @@ PerformanceReport::PerformanceReport( options_(options), argument_names_(argument_names), problem_index_(0), good_(true), op_kind_(op_kind) { // Strip '.csv' if present - std::string base_path = options_.report.output_path.substr( - 0, options_.report.output_path.rfind(".csv")); - + std::string base_path = options_.report.output_path; + base_path = base_path.substr(0, base_path.rfind(".csv")); op_file_name_ = base_path + "." + to_string(op_kind_) + ".csv"; + base_path = options_.report.junit_output_path; + base_path = base_path.substr(0, base_path.rfind(".xml")); + base_path = base_path.substr(0, base_path.rfind(".junit")); + op_junit_file_name_ = base_path + "." + to_string(op_kind_) + ".junit.xml"; + // // Open output file for operation of PerformanceReport::op_kind // @@ -108,6 +112,21 @@ PerformanceReport::PerformanceReport( print_csv_header_(output_file_) << std::endl; } } + + if (!options_.report.junit_output_path.empty()) { + + junit_output_file_.open(op_junit_file_name_); + + if (!junit_output_file_.good()) { + + std::cerr << "Could not open junit output file at path '" + << options_.report.junit_output_path << "'" << std::endl; + + good_ = false; + } + + print_junit_header_(junit_output_file_); + } } void PerformanceReport::next_problem() { @@ -123,6 +142,10 @@ void PerformanceReport::append_result(PerformanceResult result) { print_result_pretty_(std::cout, result) << std::flush; } + if (junit_output_file_.is_open()) { + print_junit_result_(junit_output_file_, result); + } + if (output_file_.is_open()) { print_result_csv_(output_file_, result) << std::endl; } @@ -143,7 +166,7 @@ void PerformanceReport::append_results(PerformanceResultVector const &results) { } } -void PerformanceReport::close() { +PerformanceReport::~PerformanceReport() { // // Output results to stdout if they were not written to a file already. 
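PerformanceReport derives per-operation file names by stripping a trailing '.csv' (or '.junit'/'.xml') from the user-supplied path and splicing in the operation kind. A small sketch of that suffix handling; the helper names are illustrative:

#include <cstdio>
#include <string>

// The user-supplied path may or may not carry the extension; the operation kind is
// spliced in ahead of it either way.
std::string csv_path(std::string base, std::string const &op) {
  base = base.substr(0, base.rfind(".csv"));
  return base + "." + op + ".csv";
}

std::string junit_path(std::string base, std::string const &op) {
  base = base.substr(0, base.rfind(".xml"));
  base = base.substr(0, base.rfind(".junit"));
  return base + "." + op + ".junit.xml";
}

int main() {
  std::printf("%s\n", csv_path("results.csv", "conv2d").c_str());        // results.conv2d.csv
  std::printf("%s\n", csv_path("results", "conv2d").c_str());            // results.conv2d.csv
  std::printf("%s\n", junit_path("report.junit.xml", "gemm").c_str());   // report.gemm.junit.xml
  return 0;
}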
@@ -161,7 +184,17 @@ void PerformanceReport::close() { } } else if (output_file_.is_open() && options_.report.verbose) { - std::cout << "\n\nWrote results to '" << op_file_name_ << "'" << std::endl; + std::cout << "\nWrote results to '" << op_file_name_ << "'" << std::endl; + } + + if (output_file_.is_open()) { + output_file_.close(); + } + + if (junit_output_file_.is_open()) { + print_junit_footer_(junit_output_file_); + junit_output_file_.close(); + std::cout << "\nWrote jUnit results to '" << op_junit_file_name_ << "'" << std::endl; } } @@ -179,7 +212,8 @@ static const char *disposition_status_color(Disposition disposition) { /// Prints the result in human readable form std::ostream & PerformanceReport::print_result_pretty_( std::ostream &out, - PerformanceResult const &result) { + PerformanceResult const &result, + bool use_shell_coloring) { out << "=============================\n" << " Problem ID: " << result.problem_index << "\n"; @@ -196,14 +230,20 @@ std::ostream & PerformanceReport::print_result_pretty_( out << "\n"; } + std::string shell_color_bright = use_shell_coloring ? SHELL_COLOR_BRIGHT() : ""; + std::string shell_color_end = use_shell_coloring ? SHELL_COLOR_END() : ""; + auto _disposition_status_color = [&](Disposition d) -> const char * { + return use_shell_coloring ? disposition_status_color(d) : ""; + }; + out << "\n" - << " Provider: " << SHELL_COLOR_BRIGHT() << library::to_string(result.provider, true) << SHELL_COLOR_END() << "\n" - << " OperationKind: " << SHELL_COLOR_BRIGHT() << library::to_string(result.op_kind) << SHELL_COLOR_END() << "\n" + << " Provider: " << shell_color_bright << library::to_string(result.provider, true) << shell_color_end << "\n" + << " OperationKind: " << shell_color_bright << library::to_string(result.op_kind) << shell_color_end << "\n" << " Operation: " << result.operation_name << "\n\n" - << " Status: " << SHELL_COLOR_BRIGHT() << library::to_string(result.status, true) << SHELL_COLOR_END() << "\n" - << " Verification: " << SHELL_COLOR_BRIGHT() << (options_.verification.enabled ? "ON":"OFF") << SHELL_COLOR_END() << "\n" - << " Disposition: " << disposition_status_color(result.disposition) << to_string(result.disposition, true) << SHELL_COLOR_END() << "\n\n"; + << " Status: " << shell_color_bright << library::to_string(result.status, true) << shell_color_end << "\n" + << " Verification: " << shell_color_bright << (options_.verification.enabled ? 
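print_result_pretty_() now takes a use_shell_coloring flag so the same routine can feed both the colored console report and the plain text embedded in the jUnit output. A sketch of the pattern; the ANSI escape constants below are common values chosen for illustration, not the profiler's SHELL_COLOR_* definitions:

#include <cstdio>
#include <string>

// Illustrative stand-ins for the profiler's shell-color helpers.
static char const *BRIGHT = "\033[1m";
static char const *RESET  = "\033[0m";

// One printing routine serves both destinations by threading a coloring flag through it.
void print_status(std::string const &label, std::string const &value, bool use_shell_coloring) {
  std::string bright = use_shell_coloring ? BRIGHT : "";
  std::string reset  = use_shell_coloring ? RESET  : "";
  std::printf("  %s: %s%s%s\n", label.c_str(), bright.c_str(), value.c_str(), reset.c_str());
}

int main() {
  print_status("Provider", "CUTLASS", true);   // colored for the terminal
  print_status("Provider", "CUTLASS", false);  // plain for machine-readable output
  return 0;
}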
"ON":"OFF") << shell_color_end << "\n" + << " Disposition: " << _disposition_status_color(result.disposition) << to_string(result.disposition, true) << shell_color_end << "\n\n"; // Display individual verification results for each verification-provider if (options_.verification.enabled) { @@ -263,10 +303,6 @@ std::ostream & PerformanceReport::print_csv_header_( << ",OperationKind,Operation,Disposition,Status"; for (auto const &arg_name : argument_names_) { - // Operand E is internal to the sparse kernel - if (arg_name.compare("E") == 0) - continue; - out << "," << arg_name; } @@ -327,6 +363,112 @@ std::ostream & PerformanceReport::print_result_csv_( return out; } +std::ostream & PerformanceReport::print_junit_header_(std::ostream &out) { + + out << "" << std::endl; + out << "" << std::endl; + return out; + +} + +namespace { + + std::string escape_xml_special_chars(const std::string& src) { + std::stringstream dst; + for (char ch : src) { + switch (ch) { + case '&': dst << "&"; break; + case '\'': dst << "'"; break; + case '"': dst << """; break; + case '<': dst << "<"; break; + case '>': dst << ">"; break; + default: dst << ch; break; + } + } + return dst.str(); + } + + template + std::ostream & print_junit_result_property_(std::ostream & os, const std::string & name, const T & property) { + return os << " " << std::endl; + } +} + +std::ostream & PerformanceReport::print_junit_result_(std::ostream &out, PerformanceResult const &result) { + + out << " " << "" << std::endl; + + if (failed) { + out << " " << std::endl; + } + + if (error) { + out << " " << std::endl; + } + + out << " " << std::endl; + + out << " " << std::endl; + + return out; + +} + +std::ostream & PerformanceReport::print_junit_footer_(std::ostream &out) { + + out << "" << std::endl; + return out; + +} + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace profiler diff --git a/tools/profiler/src/performance_report.h b/tools/profiler/src/performance_report.h index 1c086e6185..5005103158 100644 --- a/tools/profiler/src/performance_report.h +++ b/tools/profiler/src/performance_report.h @@ -59,6 +59,12 @@ class PerformanceReport { /// Output file containing results std::ofstream output_file_; + /// Operation file name containing junit performance report of op_kind + std::string op_junit_file_name_; + + /// Output file containing junit results + std::ofstream junit_output_file_; + /// Flag indicating the performance report is valid bool good_; @@ -74,6 +80,7 @@ class PerformanceReport { public: PerformanceReport(Options const &options, std::vector const &argument_names, library::OperationKind const &op_kind); + ~PerformanceReport(); bool good() const { return good_; } @@ -81,8 +88,6 @@ class PerformanceReport { void append_result(PerformanceResult result); void append_results(PerformanceResultVector const &results); - void close(); - public: /// Prints the CSV header @@ -91,10 +96,21 @@ class PerformanceReport { /// Prints the CSV std::ostream & print_result_csv_(std::ostream &out, PerformanceResult const &result); + /// @defgroup jUnit Result Generation + /// Functions related to generation of the jUnit results + /// @{ + + std::ostream & print_junit_header_(std::ostream &out); + std::ostream & print_junit_result_(std::ostream &out, PerformanceResult const &result); + std::ostream & print_junit_footer_(std::ostream &out); + + /// @} + /// Prints the result in human readable form std::ostream & print_result_pretty_( std::ostream &out, - PerformanceResult const &result); 
+ PerformanceResult const &result, + bool use_shell_coloring = true); }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/problem_space.cpp b/tools/profiler/src/problem_space.cpp index e69b0110e9..a8c4943218 100644 --- a/tools/profiler/src/problem_space.cpp +++ b/tools/profiler/src/problem_space.cpp @@ -961,6 +961,85 @@ bool arg_as_SplitKModeID( ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ConvModeID( + library::ConvModeID &conv_mode, + KernelArgument::Value const *value_ptr) { + + if (value_ptr->not_null) { + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + + conv_mode = library::from_string( + static_cast(value_ptr)->element); + + if (conv_mode == library::ConvModeID::kInvalid) { + throw std::runtime_error( + "arg_as_ConvModeID() - illegal cast."); + } + } + else { + + throw std::runtime_error( + "arg_as_ConvModeID() - illegal cast."); + } + return true; + } + return false; +} + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ConvModeID( + library::ConvModeID &conv_mode, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + return arg_as_ConvModeID(conv_mode, value_ptr); +} + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ProviderID( + library::Provider &provider, + KernelArgument::Value const *value_ptr) { + + if (value_ptr->not_null) { + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + + provider = library::from_string( + static_cast(value_ptr)->element); + + if (provider == library::Provider::kInvalid) { + throw std::runtime_error( + "arg_as_ProviderID() - illegal cast."); + } + } + else { + + throw std::runtime_error( + "arg_as_ProviderID() - illegal cast."); + } + return true; + } + return false; +} + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ProviderID( + library::Provider &provider, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + return arg_as_ProviderID(provider, value_ptr); +} +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Lexically casts an argument to a given type stored in a byte array. Returns true if not null. 
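The arg_as_ConvModeID()/arg_as_ProviderID() helpers follow one pattern: return false when the argument is absent, parse it when present, and throw on an unrecognized value. A compact sketch of that pattern with a hypothetical enumeration and parser standing in for the library's from_string<>:

#include <cstdio>
#include <stdexcept>
#include <string>

// Hypothetical enumeration and parser used only to illustrate the arg_as_*ID pattern.
enum class ConvModeID { kCrossCorrelation, kConvolution, kInvalid };

ConvModeID from_string(std::string const &s) {
  if (s == "cross") return ConvModeID::kCrossCorrelation;
  if (s == "conv")  return ConvModeID::kConvolution;
  return ConvModeID::kInvalid;
}

// Returns true if the argument was supplied; throws if supplied but malformed.
bool arg_as_conv_mode(ConvModeID &mode, bool not_null, std::string const &text) {
  if (!not_null) {
    return false;                       // argument absent: leave the caller's default in place
  }
  mode = from_string(text);
  if (mode == ConvModeID::kInvalid) {
    throw std::runtime_error("arg_as_conv_mode() - illegal cast.");
  }
  return true;
}

int main() {
  ConvModeID mode = ConvModeID::kCrossCorrelation;
  bool set = arg_as_conv_mode(mode, true, "conv");
  std::printf("set=%d mode_is_conv=%d\n", set, mode == ConvModeID::kConvolution);
  return 0;
}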
bool arg_as_scalar( std::vector &bytes, @@ -1049,9 +1128,94 @@ bool tensor_description_satisfies( return false; } +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns true if conv_kind satisfies the value +bool conv_kind_satisfies( + library::ConvKind const &conv_kind, + EnumeratedTypeArgument::EnumeratedTypeValue const *value_ptr) { + + if (value_ptr->not_null) { + library::ConvKind conv_kind_cmd_line = + library::from_string(value_ptr->element); + + if (conv_kind_cmd_line != library::ConvKind::kUnknown && + conv_kind_cmd_line != conv_kind) { + + return false; + } + } + + return true; +} + +/// Returns true if conv_kind satisfies the value +bool conv_kind_satisfies( + library::ConvKind const &conv_kind, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + return conv_kind_satisfies( + conv_kind, + static_cast(value_ptr)); + } + else { + throw std::runtime_error("Kernel argument mismatch"); + } + + return false; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns true if a iterator algorithm satisfies the value +bool iterator_algorithm_satisfies( + library::IteratorAlgorithmID const &iterator_algorithm, + EnumeratedTypeArgument::EnumeratedTypeValue const *value_ptr) { + + if (value_ptr->not_null) { + library::IteratorAlgorithmID iterator_algorithm_cmd_line = + library::from_string(value_ptr->element); + + if (iterator_algorithm_cmd_line != library::IteratorAlgorithmID::kNone && + iterator_algorithm_cmd_line != iterator_algorithm) { + + return false; + } + } + + return true; +} + +/// Returns true if a iterator algorithm satisfies the value +bool iterator_algorithm_satisfies( + library::IteratorAlgorithmID const &iterator_algorithm, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + return iterator_algorithm_satisfies( + iterator_algorithm, + static_cast(value_ptr)); + } + else { + throw std::runtime_error("Kernel argument mismatch"); + } + + return false; +} + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace profiler } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/tools/profiler/src/problem_space.h b/tools/profiler/src/problem_space.h index 8a9ee4f2e8..8e10dbafce 100644 --- a/tools/profiler/src/problem_space.h +++ b/tools/profiler/src/problem_space.h @@ -909,6 +909,37 @@ bool arg_as_SplitKModeID( ProblemSpace const &problem_space, ProblemSpace::Problem const &problem); +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ConvModeID(library::ConvModeID &conv_mode, KernelArgument::Value const *value_ptr); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. 
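conv_kind_satisfies() and iterator_algorithm_satisfies() treat an unset or unrecognized command-line value as a wildcard and only reject a kernel when an explicit filter disagrees. A minimal sketch of that predicate with a hypothetical ConvKind:

#include <cstdio>

// Hypothetical ConvKind with an explicit "unknown" wildcard: an unset or unknown
// command-line value matches every kernel; otherwise the kinds must match exactly.
enum class ConvKind { kUnknown, kFprop, kDgrad, kWgrad };

bool conv_kind_satisfies(ConvKind kernel_kind, bool value_not_null, ConvKind cmd_line_kind) {
  if (value_not_null &&
      cmd_line_kind != ConvKind::kUnknown &&
      cmd_line_kind != kernel_kind) {
    return false;   // an explicit, recognized filter that disagrees rejects the kernel
  }
  return true;      // everything else passes
}

int main() {
  std::printf("%d\n", conv_kind_satisfies(ConvKind::kFprop, false, ConvKind::kUnknown)); // 1
  std::printf("%d\n", conv_kind_satisfies(ConvKind::kFprop, true,  ConvKind::kFprop));   // 1
  std::printf("%d\n", conv_kind_satisfies(ConvKind::kFprop, true,  ConvKind::kDgrad));   // 0
  return 0;
}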
+bool arg_as_ConvModeID( + library::ConvModeID &conv_mode, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_IteratorAlgorithmID(library::IteratorAlgorithmID &iterator_algorithm, KernelArgument::Value const *value_ptr); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_IteratorAlgorithmID( + library::IteratorAlgorithmID &iterator_algorithm, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ProviderID(library::Provider &provider, KernelArgument::Value const *value_ptr); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_ProviderID( + library::Provider &provider, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + /// Lexically casts an argument to a given type stored in a byte array. Returns true if not null. bool arg_as_scalar( std::vector &bytes, @@ -935,10 +966,34 @@ bool tensor_description_satisfies( ProblemSpace const &problem_space, ProblemSpace::Problem const &problem); + +/// Returns true if a conv kind satisfies the value +bool conv_kind_satisfies( + library::ConvKind const &conv_kind, + EnumeratedTypeArgument::EnumeratedTypeValue const *value_ptr); + +/// Returns true if a conv kind satisfies the value +bool conv_kind_satisfies( + library::ConvKind const &conv_kind, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +/// Returns true if a iterator algorithm satisfies the value +bool iterator_algorithm_satisfies( + library::IteratorAlgorithmID const &iterator_algorithm, + EnumeratedTypeArgument::EnumeratedTypeValue const *value_ptr); + +/// Returns true if a iterator algorithm satisfies the value +bool iterator_algorithm_satisfies( + library::IteratorAlgorithmID const &iterator_algorithm, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace profiler } // namespace cutlass //////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/tools/profiler/src/reduction_operation_profiler.h b/tools/profiler/src/reduction_operation_profiler.h new file mode 100644 index 0000000000..e00dcc0b60 --- /dev/null +++ b/tools/profiler/src/reduction_operation_profiler.h @@ -0,0 +1,167 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Defines profiling functionality for reduction operation + +*/ + +#pragma once + +#include +#include +#include +#include +#include + +// CUTLASS Library includes +#include "cutlass/library/library.h" +#include "cutlass/library/util.h" +#include "cutlass/library/manifest.h" + +// Profiler includes +#include "options.h" +#include "device_context.h" +#include "operation_profiler.h" +#include "performance_result.h" +#include "problem_space.h" +#if CUTLASS_ENABLE_CUDNN +#include "cudnn_helpers.h" +#endif //#if CUTLASS_ENABLE_CUDNN +#include "debug.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Abstract base class for each math function +class ReductionOperationProfiler : public OperationProfiler { +public: + + + /// Workspace used + struct ReductionWorkspace { + + /// Conv device allocations + DeviceAllocation *Workspace; + DeviceAllocation *Source; + DeviceAllocation *Destination; + DeviceAllocation *Reference; + + /// Library configuration and arguments + library::ReductionConfiguration configuration; + library::ReductionArguments arguments; + + /// Buffer used for the cutlass operations' host workspace + std::vector host_workspace; + + /// Buffer used for the cutlass operations' device workspace + DeviceAllocation device_workspace; + + // + // Methods + // + + ReductionWorkspace(): + Workspace(nullptr), Source(nullptr), Destination(nullptr), Reference(nullptr) { } + }; + +protected: + + // + // Data members + // + + /// Reduction problem obtained from problem space + MatrixCoord problem_; + + /// Device memory allocations + ReductionWorkspace conv_workspace_; + + +public: + // + // Methods + // + + /// Ctor + ReductionOperationProfiler(Options const &options); + + /// Destructor + virtual ~ReductionOperationProfiler(); + + /// Prints usage statement for the math function + virtual void print_usage(std::ostream &out) const; + + /// Prints examples + virtual void print_examples(std::ostream &out) const; + + /// Extracts the problem dimensions + virtual Status initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes 
workspace + virtual Status initialize_workspace( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Verifies CUTLASS against references + virtual bool verify_cutlass( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Measures performance results + virtual bool profile( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/profiler/src/sparse_gemm_operation_profiler.cu b/tools/profiler/src/sparse_gemm_operation_profiler.cu index 702b79bb6c..7eff2062b0 100644 --- a/tools/profiler/src/sparse_gemm_operation_profiler.cu +++ b/tools/profiler/src/sparse_gemm_operation_profiler.cu @@ -227,6 +227,9 @@ void SparseGemmOperationProfiler::SparseGemmProblem::initialize_result( set_argument(result, "C", problem_space, std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout)); + set_argument(result, "E", problem_space, + std::string(library::to_string(operation_desc.E.element)) + ":" + library::to_string(operation_desc.E.layout)); + set_argument(result, "m", problem_space, m); set_argument(result, "n", problem_space, n); set_argument(result, "k", problem_space, k); diff --git a/tools/util/include/cutlass/util/host_reorder.h b/tools/util/include/cutlass/util/host_reorder.h index 1d12add3ef..660ee0f956 100644 --- a/tools/util/include/cutlass/util/host_reorder.h +++ b/tools/util/include/cutlass/util/host_reorder.h @@ -62,6 +62,18 @@ void reorder_column(TensorRef dest, } } +template +void reorder_convK(TensorRef dest, + TensorRef src, + cutlass::gemm::GemmCoord problem_size) { + + TensorRef> mappedDest(dest.data(), dest.stride(0)); + TensorRef> mappedSrc(src.data(), src.stride(0)); + + reorder_column( + mappedDest, mappedSrc, problem_size); +} + /// This is needed for the sparse tensor core kernels. The purpose /// is to use ldmatrix to load from shared memory to the register file. template diff --git a/tools/util/include/cutlass/util/reference/device/convolution.h b/tools/util/include/cutlass/util/reference/device/convolution.h new file mode 100644 index 0000000000..843b6b15b9 --- /dev/null +++ b/tools/util/include/cutlass/util/reference/device/convolution.h @@ -0,0 +1,1536 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Reference implementation for convolution in device-side code. +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + +namespace cutlass { +namespace reference { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace kernel { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Conv2d device reference kernel +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2d Fprop kernel - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv2dFprop( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t npq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_p[kThreadM]; + int thread_q[kThreadM]; + + // Compute N, P, Q coordinates for each row of a 
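The reference fprop kernels assign each thread a run of GEMM-M rows and decompose every linear index back into output coordinates. A host-side sketch of the same (n, p, q) decomposition, with illustrative sizes:

#include <cstdint>
#include <cstdio>

struct Coord { int n, p, q; };

// Decomposes a linear GEMM-M index into (n, p, q) the way the fprop reference does:
// npq = ((n * P) + p) * Q + q.
Coord decompose_npq(int64_t npq, int P, int Q) {
  int64_t PQ = int64_t(P) * Q;
  Coord c;
  c.n = int(npq / PQ);
  int64_t residual = npq % PQ;
  c.p = int(residual / Q);
  c.q = int(residual % Q);
  return c;
}

int main() {
  int P = 7, Q = 5;
  Coord c = decompose_npq(2 * (int64_t(P) * Q) + 3 * Q + 4, P, Q);
  std::printf("n=%d p=%d q=%d\n", c.n, c.p, c.q);   // n=2 p=3 q=4
  return 0;
}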
thread's tile + int64_t PQ = int64_t(problem_size.P) * problem_size.Q; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t npq = npq_start + m; + + thread_n[m] = int(npq / PQ); + + int64_t residual = npq % PQ; + thread_p[m] = int(residual / problem_size.Q); + thread_q[m] = int(residual % problem_size.Q); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int C = 0; C < problem_size.C; ++C) { + + // Load from activations tensor + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (thread_n[m] < problem_size.N && h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) { + element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], h, w, C})); + } + else { + element_A[m] = ElementAccumulator(); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + + if (thread_k < problem_size.K) { + element_B[n] = ElementAccumulator(tensor_w.at({thread_k, R, S, C})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + } + } + } + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + if (thread_n[m] < problem_size.N && thread_p[m] < problem_size.P && thread_q[m] < problem_size.Q) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + if (thread_k < problem_size.K) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_p[m], thread_q[m], thread_k})); + } + + tensor_y_out.at({thread_n[m], thread_p[m], thread_q[m], thread_k}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +// Conv3d Fprop kernel - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv3dFprop( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator 
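The fprop reference maps an output position and filter tap to an input coordinate as h = p*stride_h - pad_h + r*dilation_h, flipping the filter index first when the mode is kConvolution. A small sketch of that mapping; the parameter values are illustrative:

#include <cstdio>

// Maps an output coordinate and filter tap back to an input coordinate, including
// the filter flip that turns cross-correlation into true convolution.
int input_coord(int out, int filter, int filter_extent, int stride, int pad, int dilation,
                bool true_convolution) {
  if (true_convolution) {
    filter = filter_extent - 1 - filter;   // kConvolution mode flips the filter taps
  }
  return out * stride - pad + filter * dilation;
}

int main() {
  // p = 3, r = 1 of a 3-tap filter, stride 2, pad 1, dilation 1:
  std::printf("cross-correlation h = %d\n", input_coord(3, 1, 3, 2, 1, 1, false)); // 6
  std::printf("convolution       h = %d\n", input_coord(3, 1, 3, 2, 1, 1, true));  // 6 (center tap)
  std::printf("convolution r=0   h = %d\n", input_coord(3, 0, 3, 2, 1, 1, true));  // 7
  return 0;
}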
element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t nzpq_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int k_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_z[kThreadM]; + int thread_p[kThreadM]; + int thread_q[kThreadM]; + + // Compute N, Z, P, Q coordinates for each row of a thread's tile + int64_t PQ = int64_t(problem_size.P) * problem_size.Q; + int64_t ZPQ = PQ * problem_size.Z; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t nzpq = nzpq_start + m; + + thread_n[m] = int(nzpq / ZPQ); + + int64_t residual = nzpq % ZPQ; + thread_z[m] = int(residual / PQ); + + residual = residual % PQ; + thread_p[m] = int(residual / problem_size.Q); + thread_q[m] = int(residual % problem_size.Q); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int T = 0; T < problem_size.T; ++T) { + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int C = 0; C < problem_size.C; ++C) { + + // Load from activations tensor + int filter_t = T; + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - R; + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int d = thread_z[m] * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d; + int h = thread_p[m] * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = thread_q[m] * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (thread_n[m] < problem_size.N && + d >= 0 && d < problem_size.D && + h >= 0 && h < problem_size.H && + w >= 0 && w < problem_size.W) { + + element_A[m] = ElementAccumulator(tensor_x.at({thread_n[m], d, h, w, C})); + } + else { + element_A[m] = ElementAccumulator(); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + + if (thread_k < problem_size.K) { + element_B[n] = ElementAccumulator(tensor_w.at({thread_k, T, R, S, C})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + + } // for (C) + } // for (S) + } // for (R) + } // for (T) + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + if (thread_n[m] < problem_size.N && + thread_z[m] < problem_size.Z && + thread_p[m] < problem_size.P && + thread_q[m] < problem_size.Q) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_k = k_start + n; + if (thread_k < problem_size.K) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_y_in.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k})); + } + + tensor_y_out.at({thread_n[m], thread_z[m], thread_p[m], thread_q[m], thread_k}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } // for 
(n) + + } + } // for (m) +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2d dgrad kernel - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv2dDgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t nhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_h[kThreadM]; + int thread_w[kThreadM]; + + // Compute N, H, W coordinates for each row of a thread's tile + int64_t HW = int64_t(problem_size.H) * problem_size.W; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t nhw = nhw_start + m; + + thread_n[m] = int(nhw / HW); + + int64_t residual = nhw % HW; + thread_h[m] = int(residual / problem_size.W); + thread_w[m] = int(residual % problem_size.W); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int K = 0; K < problem_size.K; ++K) { + + // Load from activations tensor + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w; + + element_A[m] = ElementAccumulator(); + + if (p >= 0 && !(p % problem_size.stride_h) && q >= 0 && !(q % problem_size.stride_w)) { + + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (thread_n[m] < problem_size.N && p < problem_size.P && q < problem_size.Q) { + element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], p, q, K})); + } + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + + if (thread_c < problem_size.C) { + element_B[n] = ElementAccumulator(tensor_w.at({K, R, S, thread_c})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + } + } + } + + // Write out 
the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + if (thread_n[m] < problem_size.N && thread_h[m] < problem_size.H && thread_w[m] < problem_size.W) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + if (thread_c < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_h[m], thread_w[m], thread_c})); + } + + tensor_dx_out.at({thread_n[m], thread_h[m], thread_w[m], thread_c}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +// Conv3d dgrad kernel - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 16, // shape of a threadblock in units of threads + int kCtaShapeN = 8 // shape of a threadblock in units of threads +> +__global__ void Conv3dDgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int64_t ndhw_start = int64_t(blockIdx.x) * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int c_start = blockIdx.y * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_n[kThreadM]; + int thread_d[kThreadM]; + int thread_h[kThreadM]; + int thread_w[kThreadM]; + + // Compute N, H, W coordinates for each row of a thread's tile + int64_t HW = int64_t(problem_size.H) * problem_size.W; + int64_t DHW = HW * problem_size.D; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int64_t ndhw = ndhw_start + m; + + thread_n[m] = int(ndhw / DHW); + + int64_t residual = ndhw % DHW; + thread_d[m] = int(residual / HW); + + residual = residual % HW; + thread_h[m] = int(residual / problem_size.W); + thread_w[m] = int(residual % problem_size.W); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int T = 0; T < problem_size.T; ++T) { + for (int R = 0; R < problem_size.R; ++R) { + for (int S = 0; S < problem_size.S; ++S) { + for (int K = 0; K < problem_size.K; ++K) { + + // Load from activations tensor + int filter_t = T; + int filter_r = R; + int filter_s = S; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - T; + filter_r = problem_size.R - 1 - R; + filter_s = problem_size.S - 1 - S; + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + int z = thread_d[m] + problem_size.pad_d - filter_t * problem_size.dilation_d; + int p = thread_h[m] + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = thread_w[m] + problem_size.pad_w - filter_s * problem_size.dilation_w; + + element_A[m] = ElementAccumulator(); + + if (z >= 0 && !(z % problem_size.stride_d) && + p >= 0 && 
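Dgrad inverts that mapping: an input row only receives a contribution when h + pad - r*dilation is non-negative, divisible by the stride, and lands inside the output extent, which is the check the reference dgrad kernels perform before indexing dy. A host-side sketch of the check, with illustrative parameters:

#include <cstdio>

// Returns true and sets p_out if output row p contributes to input row h through
// filter tap r; mirrors the divisibility and range tests in the dgrad references.
bool contributing_output_row(int h, int r, int stride, int pad, int dilation,
                             int P, int &p_out) {
  int p = h + pad - r * dilation;
  if (p < 0 || (p % stride) != 0) {
    return false;            // this (h, r) pair does not align with any output row
  }
  p /= stride;
  if (p >= P) {
    return false;            // aligned, but outside the output extent
  }
  p_out = p;
  return true;
}

int main() {
  int p;
  // h = 5, r = 0, stride 2, pad 1, dilation 1, P = 4:
  if (contributing_output_row(5, 0, 2, 1, 1, 4, p)) {
    std::printf("dy row p = %d contributes\n", p);   // p = 3
  }
  // h = 4 with the same parameters fails the divisibility check:
  std::printf("h=4 contributes: %d\n", contributing_output_row(4, 0, 2, 1, 1, 4, p)); // 0
  return 0;
}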
!(p % problem_size.stride_h) && + q >= 0 && !(q % problem_size.stride_w)) { + + z = z / problem_size.stride_d; + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (thread_n[m] < problem_size.N && z < problem_size.Z && p < problem_size.P && q < problem_size.Q) { + element_A[m] = ElementAccumulator(tensor_dy.at({thread_n[m], z, p, q, K})); + } + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + + if (thread_c < problem_size.C) { + element_B[n] = ElementAccumulator(tensor_w.at({K, T, R, S, thread_c})); + } + else { + element_B[n] = ElementAccumulator(); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + + } // for (C) + } // for (S) + } // for (R) + } // for (T) + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + + if (thread_n[m] < problem_size.N && + thread_d[m] < problem_size.D && + thread_h[m] < problem_size.H && + thread_w[m] < problem_size.W) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + int thread_c = c_start + n; + if (thread_c < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dx_in.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c})); + } + + tensor_dx_out.at({thread_n[m], thread_d[m], thread_h[m], thread_w[m], thread_c}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2d wgrad kernel - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 8, // shape of a threadblock in units of threads + int kCtaShapeN = 16 // shape of a threadblock in units of threads +> +__global__ void Conv2dWgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int64_t rsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_r[kThreadN]; + int thread_s[kThreadN]; + int thread_c[kThreadN]; + + // Compute R, S, C coordinates for each row of a thread's tile + int64_t SC = int64_t(problem_size.S) * problem_size.C; + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + int64_t rsc = rsc_start + n; + int64_t residual = rsc % SC; + + thread_r[n] = int(rsc / SC); + thread_s[n] = int(residual / problem_size.C); + thread_c[n] = int(residual % problem_size.C); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < 
kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int N = 0; N < problem_size.N; ++N) { + for (int P = 0; P < problem_size.P; ++P) { + for (int Q = 0; Q < problem_size.Q; ++Q) { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + element_A[m] = ElementAccumulator(); + + if (thread_k < problem_size.K) { + element_A[m] = ElementAccumulator(tensor_dy.at({N, P, Q, thread_k})); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + // Load from activations tensor + int filter_r = thread_r[n]; + int filter_s = thread_s[n]; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - filter_r; + filter_s = problem_size.S - 1 - filter_s; + } + + int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + element_B[n] = ElementAccumulator(); + + if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W && thread_c[n] < problem_size.C) { + element_B[n] = ElementAccumulator(tensor_x.at({N, h, w, thread_c[n]})); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + } + } + } + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + if (thread_k < problem_size.K) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + if (thread_r[n] < problem_size.R && thread_s[n] < problem_size.S && thread_c[n] < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_r[n], thread_s[n], thread_c[n]})); + } + + tensor_dw_out.at({thread_k, thread_r[n], thread_s[n], thread_c[n]}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +// Conv3d wgrad kernel - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add, + int kThreadM = 2, // shape of a thread's tile in the GEMM M dimension + int kThreadN = 4, // shape of a thread's tile in the GEMM N dimension + int kCtaShapeM = 8, // shape of a threadblock in units of threads + int kCtaShapeN = 16 // shape of a threadblock in units of threads +> +__global__ void Conv3dWgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta + ) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + ElementAccumulator element_A[kThreadM]; + ElementAccumulator element_B[kThreadN]; + ElementAccumulator accum[kThreadM][kThreadN]; + + int k_start = blockIdx.x * kCtaShapeM * kThreadM + threadIdx.x * kThreadM; + int64_t trsc_start = int64_t(blockIdx.y) * kCtaShapeN * kThreadN + threadIdx.y * kThreadN; + + int thread_t[kThreadN]; + int thread_r[kThreadN]; + int thread_s[kThreadN]; + int thread_c[kThreadN]; + + // 
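Wgrad reduces over the output positions rather than the filter taps: each dw[k][r][s][c] accumulates dy[n][p][q][k] * x[n][h][w][c] across every (n, p, q). The sketch below computes a single entry on the host with unit stride, zero padding, and tiny illustrative shapes, so it is a simplification of the kernel above rather than a drop-in equivalent:

#include <cstdio>
#include <vector>

// One weight-gradient entry dw[k][r][s][c], reduced over all output positions whose
// receptive field touches that filter tap. NHWC-style packed storage is assumed.
float wgrad_entry(std::vector<float> const &dy, std::vector<float> const &x,
                  int N, int P, int Q, int K, int H, int W, int C,
                  int k, int r, int s, int c) {
  float acc = 0.f;
  for (int n = 0; n < N; ++n) {
    for (int p = 0; p < P; ++p) {
      for (int q = 0; q < Q; ++q) {
        int h = p + r;   // stride 1, pad 0, dilation 1 for brevity
        int w = q + s;
        if (h < H && w < W) {
          acc += dy[((n * P + p) * Q + q) * K + k] * x[((n * H + h) * W + w) * C + c];
        }
      }
    }
  }
  return acc;
}

int main() {
  // 1x3x3x1 activation of ones and 1x2x2x1 gradient of ones -> each dw entry is 4.
  int N = 1, H = 3, W = 3, C = 1, P = 2, Q = 2, K = 1;
  std::vector<float> x(N * H * W * C, 1.f), dy(N * P * Q * K, 1.f);
  std::printf("dw[0][0][0][0] = %f\n", wgrad_entry(dy, x, N, P, Q, K, H, W, C, 0, 0, 0, 0));
  return 0;
}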
Compute R, S, C coordinates for each row of a thread's tile + int64_t SC = int64_t(problem_size.S) * problem_size.C; + int64_t RSC = SC * problem_size.R; + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + int64_t trsc = trsc_start + n; + + thread_t[n] = int(trsc / RSC); + + int64_t residual = trsc % RSC; + thread_r[n] = int(residual / SC); + + residual = residual % SC; + thread_s[n] = int(residual / problem_size.C); + thread_c[n] = int(residual % problem_size.C); + } + + // Clear accumulators + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = ElementAccumulator(); + } + } + + // Compute convolution + for (int N = 0; N < problem_size.N; ++N) { + for (int Z = 0; Z < problem_size.Z; ++Z) { + for (int P = 0; P < problem_size.P; ++P) { + for (int Q = 0; Q < problem_size.Q; ++Q) { + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + element_A[m] = ElementAccumulator(); + + if (thread_k < problem_size.K) { + element_A[m] = ElementAccumulator(tensor_dy.at({N, Z, P, Q, thread_k})); + } + } + + // Load from filters tensor + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + // Load from activations tensor + int filter_t = thread_t[n]; + int filter_r = thread_r[n]; + int filter_s = thread_s[n]; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - filter_t; + filter_r = problem_size.R - 1 - filter_r; + filter_s = problem_size.S - 1 - filter_s; + } + + int d = Z * problem_size.stride_d - problem_size.pad_w + filter_t * problem_size.dilation_d; + int h = P * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = Q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + element_B[n] = ElementAccumulator(); + + if (d >= 0 && d < problem_size.D && + h >= 0 && h < problem_size.H && + w >= 0 && w < problem_size.W && + thread_c[n] < problem_size.C) { + + element_B[n] = ElementAccumulator(tensor_x.at({N, d, h, w, thread_c[n]})); + } + } + + // Accumulate matrix product + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + accum[m][n] = inner_product_op(element_A[m], element_B[n], accum[m][n]); + } + } + + } // for (Q) + } // for (P) + } // for (Z) + } // for (N) + + // Write out the results + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < kThreadM; ++m) { + int thread_k = k_start + m; + + if (thread_k < problem_size.K) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < kThreadN; ++n) { + + if (thread_t[n] < problem_size.T && + thread_r[n] < problem_size.R && + thread_s[n] < problem_size.S && + thread_c[n] < problem_size.C) { + + ElementCompute c_ref = ElementCompute(); + + if (beta != ElementCompute()) { + c_ref = ElementCompute(tensor_dw_in.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]})); + } + + tensor_dw_out.at({thread_k, thread_t[n], thread_r[n], thread_s[n], thread_c[n]}) = convert_op( + alpha * ElementCompute(accum[m][n]) + beta * c_ref); + } + } + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Conv2d Fprop dispatcher - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename 
ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2dFprop( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t npq = int64_t(problem_size.N) * problem_size.P * problem_size.Q; + int64_t blocks_m = (npq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv2dFprop< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_x, + tensor_w, + tensor_y_in, + tensor_y_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv3d Fprop dispatcher - y = fprop(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3dFprop( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 4; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t nzpq = int64_t(problem_size.N) * problem_size.Z * problem_size.P * problem_size.Q; + int64_t blocks_m = (nzpq + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.K + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv3dFprop< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_x, + tensor_w, + tensor_y_in, + tensor_y_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv2d Dgrad dispatcher - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + 
typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2dDgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t nhw = int64_t(problem_size.N) * problem_size.H * problem_size.W; + int64_t blocks_m = (nhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv2dDgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_w, + tensor_dx_in, + tensor_dx_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv3d Dgrad dispatcher - dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3dDgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 16; // shape of a threadblock in units of threads + int const kCtaShapeN = 8; // shape of a threadblock in units of threads + + int64_t ndhw = int64_t(problem_size.N) * problem_size.D * problem_size.H * problem_size.W; + int64_t blocks_m = (ndhw + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid(uint32_t(blocks_m), (problem_size.C + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN)); + + kernel::Conv3dDgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_w, + tensor_dx_in, + tensor_dx_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv2d Wgrad dispatcher - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + 
typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2dWgrad( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 8; // shape of a threadblock in units of threads + int const kCtaShapeN = 16; // shape of a threadblock in units of threads + + int64_t rsc = int64_t(problem_size.R) * problem_size.S * problem_size.C; + int64_t blocks_n = (rsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n)); + + kernel::Conv2dWgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_x, + tensor_dw_in, + tensor_dw_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +/// Conv3d Wgrad dispatcher - dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3dWgrad( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + // + // Blocking factors improve performance of reference implementation + // + + int const kThreadM = 2; // shape of a thread's tile in the GEMM M dimension + int const kThreadN = 4; // shape of a thread's tile in the GEMM N dimension + int const kCtaShapeM = 8; // shape of a threadblock in units of threads + int const kCtaShapeN = 16; // shape of a threadblock in units of threads + + int64_t trsc = int64_t(problem_size.T) * problem_size.R * problem_size.S * problem_size.C; + int64_t blocks_n = (trsc + (kCtaShapeN * kThreadN) - 1) / (kCtaShapeN * kThreadN); + + dim3 block(kCtaShapeM, kCtaShapeN); + dim3 grid((problem_size.K + (kCtaShapeM * kThreadM) - 1) / (kCtaShapeM * kThreadM), uint32_t(blocks_n)); + + kernel::Conv3dWgrad< + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, + InnerProductOp, + kThreadM, + kThreadN, + kCtaShapeM, + kCtaShapeN + ><<< grid, block, 0, stream >>>( + problem_size, + tensor_dy, + tensor_x, + tensor_dw_in, + tensor_dw_out, + alpha, + beta + ); + + cudaError_t result = cudaPeekAtLastError(); + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + return Status::kSuccess; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + 
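The device-side dispatchers above (Conv2dFprop through Conv3dWgrad) each pick fixed blocking factors, compute a launch grid, and invoke the corresponding `kernel::` reference kernel on the supplied stream. The sketch below shows how one of them might be called to produce a reference fprop output for later comparison; it is illustrative only — the tensor extents, element types, and the `Conv2dProblemSize` constructor arguments are assumptions and should be checked against `cutlass/conv/conv2d_problem_size.h` and `cutlass/util/host_tensor.h`.

```cpp
#include "cutlass/cutlass.h"
#include "cutlass/tensor_coord.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/convolution.h"

// Builds a small NHWC fprop problem and runs the device-side reference dispatcher.
// Extents and element types here are illustrative assumptions.
cutlass::Status run_reference_fprop() {

  using Layout = cutlass::layout::TensorNHWC;

  // Problem: N=1, H=8, W=8, C=16 input; K=32 filters of size R=3, S=3; pad=1, stride=1.
  cutlass::Tensor4DCoord input_size(1, 8, 8, 16);    // (N, H, W, C)
  cutlass::Tensor4DCoord filter_size(32, 3, 3, 16);  // (K, R, S, C)
  cutlass::Tensor4DCoord padding(1, 1, 1, 1);
  cutlass::MatrixCoord conv_stride(1, 1);
  cutlass::MatrixCoord dilation(1, 1);

  cutlass::conv::Conv2dProblemSize problem_size(
      input_size, filter_size, padding, conv_stride, dilation,
      cutlass::conv::Mode::kCrossCorrelation);

  // Host + device allocations for activations, filters, and output (P = Q = 8 here).
  cutlass::HostTensor<float, Layout> x(input_size);
  cutlass::HostTensor<float, Layout> w(filter_size);
  cutlass::HostTensor<float, Layout> y(cutlass::Tensor4DCoord(1, 8, 8, 32));

  // ... fill x and w on the host, then x.sync_device(); w.sync_device(); ...

  // y is passed as both the C and D operands; with beta == 0 only D is written.
  return cutlass::reference::device::Conv2dFprop<
      float, Layout, float, Layout, float, Layout, float>(
      problem_size,
      x.device_ref(), w.device_ref(),
      y.device_ref(), y.device_ref(),
      1.0f, 0.0f);
}
```

Passing the output tensor for both the C and D operands is the usual pattern when beta is zero, since the epilogue then never reads the C operand.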
+/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv2d( + conv::Operator convolutional_operator, + conv::Conv2dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + return Conv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + break; + + case conv::Operator::kDgrad: + return Conv2dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + break; + + case conv::Operator::kWgrad: + return Conv2dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + break; + + default: break; + } + + return Status::kErrorNotSupported; +} + +/// Generic 3D convolution targeting Conv3dFprop, Conv3dDgrad, and Conv3dWgrad. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +Status Conv3d( + conv::Operator convolutional_operator, + conv::Conv3dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta, + cudaStream_t stream = nullptr) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + return Conv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + + case conv::Operator::kDgrad: + return Conv3dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + + case conv::Operator::kWgrad: + return Conv3dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, stream); + + default: break; + } + + return Status::kErrorNotSupported; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/util/include/cutlass/util/reference/host/convolution.h b/tools/util/include/cutlass/util/reference/host/convolution.h new file mode 
100644
index 0000000000..48f5db81ea
--- /dev/null
+++ b/tools/util/include/cutlass/util/reference/host/convolution.h
@@ -0,0 +1,767 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Reference implementation for convolution in host-side code.
+*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/functional.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + +namespace cutlass { +namespace reference { +namespace host { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Forward propagation +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// y = conv2d(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2dFprop( + conv::Conv2dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + for (int k = 0; k < problem_size.K; ++k) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (h >= 0 && h < problem_size.H && w >= 0 && w < problem_size.W) { + + ElementA a = tensor_x.at({n, h, w, c}); + ElementB b = tensor_w.at({k, r, s, c}); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_y_in.at(cutlass::make_Coord(n, p, q, k)); + } + + tensor_y_out.at(cutlass::make_Coord(n, p, q, k)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + } + } + } + } +} + +/// Depthwise-separable convolution +template , + typename InnerProductOp = multiply_add > +void Depsep_Fprop( + cutlass::TensorView tensor_A, + cutlass::TensorView tensor_B, + cutlass::TensorView tensor_C, + ElementCompute alpha, + ElementCompute beta, + cutlass::Tensor4DCoord padding, + cutlass::Coord<2> conv_stride, + cutlass::Coord<2> dilation, + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < tensor_C.extent().n(); ++n) { + for (int p = 0; p < tensor_C.extent().h(); ++p) { + for (int q = 0; q < tensor_C.extent().w(); ++q) { + for (int g = 0; g < tensor_C.extent().c(); ++g) { + ElementAccumulator acc = ElementAccumulator(); + for (int r = 0; r < tensor_B.extent().h(); ++r) { + for (int s = 0; s < 
tensor_B.extent().w(); ++s) { + if ((p * conv_stride[0] - padding[0] + r * dilation[0]) < tensor_A.extent().h() && + (p * conv_stride[0] - padding[0] + r * dilation[0]) >= 0 && + (q * conv_stride[1] - padding[2] + s * dilation[1]) < tensor_A.extent().w() && + (q * conv_stride[1] - padding[2] + s * dilation[1]) >= 0) { + ElementA a = tensor_A.at( + cutlass::make_Coord(n, + p * conv_stride[0] - padding[0] + r * dilation[0], + q * conv_stride[1] - padding[2] + s * dilation[1], + g)); + + ElementB b = (mode == cutlass::conv::Mode::kCrossCorrelation) + ? tensor_B.at(cutlass::make_Coord(g, r, s, 0)) + : tensor_B.at(cutlass::make_Coord( + g, tensor_B.extent().h() - r - 1, tensor_B.extent().w() - s - 1, 0)); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = tensor_C.at(cutlass::make_Coord(n, p, q, g)); + tensor_C.at(cutlass::make_Coord(n, p, q, g)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Dgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2dDgrad( + cutlass::conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int h = 0; h < problem_size.H; ++h) { + for (int w = 0; w < problem_size.W; ++w) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int k = 0; k < problem_size.K; ++k) { + + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w; + + if (p >= 0 && (p % problem_size.stride_h) == 0 && + q >= 0 && (q % problem_size.stride_w) == 0) { + + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (p < problem_size.P && q < problem_size.Q) { + + ElementA a = tensor_dy.at(cutlass::make_Coord(n, p, q, k)); + ElementB b = tensor_w.at(cutlass::make_Coord(k, r, s, c)); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + + } // for (K) + } // for (S) + } // for (R) + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dx_in.at(cutlass::make_Coord(n, h, w, c)); + } + + tensor_dx_out.at(cutlass::make_Coord(n, h, w, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (W) + } // for (H) + } // for (N) +} + 
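The host-side routines above mirror the device kernels with plain nested loops, which makes them convenient as ground truth in unit tests. The sketch below wires the host `Conv2dDgrad` reference into a comparison against a dx tensor computed elsewhere (for example, by a CUTLASS implicit GEMM dgrad kernel); the helper name, the float element type, and the NHWC layouts are assumptions for illustration, while `HostTensor`, `host_ref()`, `host_view()`, and `TensorEquals` are existing CUTLASS utilities.

```cpp
#include "cutlass/conv/conv2d_problem_size.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/host/tensor_compare.h"

// Recomputes dx on the host with the reference above and compares it element-wise
// against a dx tensor produced by the implementation under test.
bool verify_conv2d_dgrad(
    cutlass::conv::Conv2dProblemSize const &problem_size,
    cutlass::HostTensor<float, cutlass::layout::TensorNHWC> &dy,
    cutlass::HostTensor<float, cutlass::layout::TensorNHWC> &w,
    cutlass::HostTensor<float, cutlass::layout::TensorNHWC> &dx_computed) {

  using Layout = cutlass::layout::TensorNHWC;

  // Reference dx with the same extent as the tensor under test.
  cutlass::HostTensor<float, Layout> dx_reference(dx_computed.extent());

  // dx_reference = 1.0f * dgrad(dy, w) + 0.0f * dx_reference
  cutlass::reference::host::Conv2dDgrad<
      float, Layout, float, Layout, float, Layout, float>(
      problem_size,
      dy.host_ref(), w.host_ref(),
      dx_reference.host_ref(), dx_reference.host_ref(),
      1.0f, 0.0f);

  // Element-wise equality of the two host views.
  return cutlass::reference::host::TensorEquals(
      dx_computed.host_view(), dx_reference.host_view());
}
```

For low-precision or floating-point accumulators, a tolerance-based comparison may be preferable to exact equality.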
+//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Wgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2dWgrad( + cutlass::conv::Conv2dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta) { + + InnerProductOp inner_product_op; + ConvertOp convert_op; + + // Apply MMA and accumulate ElementAccumulator + for (int k = 0; k < problem_size.K; ++k) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int n = 0; n < problem_size.N; ++n) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + + cutlass::Tensor4DCoord b_coord; + + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + b_coord = make_Coord( + n, + p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h, + q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w, + c); + + if (b_coord.h() < problem_size.H && b_coord.h() >= 0 && + b_coord.w() < problem_size.W && b_coord.w() >= 0) { + + ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, p, q, k))); + ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord)); + acc = inner_product_op(a, b, acc); + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dw_in.at(cutlass::make_Coord(k, r, s, c)); + } + + tensor_dw_out.at(cutlass::make_Coord(k, r, s, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (S) + } // for (R) + } // for (K) +} + +/// Generic 2D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv2d( + conv::Operator convolutional_operator, + conv::Conv2dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + Conv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kDgrad: + Conv2dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kWgrad: + Conv2dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + default: + break; + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// 3D convolution +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// y = conv3d(x, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3dFprop( + conv::Conv3dProblemSize problem_size, + TensorRef tensor_x, + TensorRef tensor_w, + TensorRef tensor_y_in, + TensorRef tensor_y_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int z = 0; z < problem_size.Z; ++z) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + for (int k = 0; k < problem_size.K; ++k) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int t = 0; t < problem_size.T; ++t) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + int filter_t = t; + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - t; + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int d = z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d; + int h = p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h; + int w = q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w; + + if (d >= 0 && d < problem_size.D && + h >=0 && h < problem_size.H && + w >= 0 && w < problem_size.W) { + + ElementA a = tensor_x.at({n, d, h, w, c}); + ElementB b = tensor_w.at({k, t, r, s, c}); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + } + } + } + + // Apply Epilogue, compute 
ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_y_in.at(cutlass::make_Coord(n, z, p, q, k)); + } + + tensor_y_out.at(cutlass::make_Coord(n, z, p, q, k)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + } + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Dgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dx = dgrad(dy, w) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3dDgrad( + cutlass::conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_w, + TensorRef tensor_dx_in, + TensorRef tensor_dx_out, + ElementCompute alpha, + ElementCompute beta) { + + ConvertOp convert_op; + InnerProductOp inner_product_op; + + // Apply MMA and accumulate ElementAccumulator + for (int n = 0; n < problem_size.N; ++n) { + for (int d = 0; d < problem_size.D; ++d) { + for (int h = 0; h < problem_size.H; ++h) { + for (int w = 0; w < problem_size.W; ++w) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int t = 0; t < problem_size.T; ++t) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int k = 0; k < problem_size.K; ++k) { + + int filter_t = t; + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - t; + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + int z = d + problem_size.pad_d - filter_t * problem_size.dilation_d; + int p = h + problem_size.pad_h - filter_r * problem_size.dilation_h; + int q = w + problem_size.pad_w - filter_s * problem_size.dilation_w; + + if (z >= 0 && (z % problem_size.stride_d) == 0 && + p >= 0 && (p % problem_size.stride_h) == 0 && + q >= 0 && (q % problem_size.stride_w) == 0) { + + z = z / problem_size.stride_d; + p = p / problem_size.stride_h; + q = q / problem_size.stride_w; + + if (z < problem_size.Z && p < problem_size.P && q < problem_size.Q) { + + ElementA a = tensor_dy.at(cutlass::make_Coord(n, z, p, q, k)); + ElementB b = tensor_w.at(cutlass::make_Coord(k, t, r, s, c)); + + acc = inner_product_op(ElementAccumulator(a), ElementAccumulator(b), acc); + } + } + + } // for (K) + } // for (S) + } // for (R) + } // for (T) + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dx_in.at(cutlass::make_Coord(n, d, h, w, c)); + } + + tensor_dx_out.at(cutlass::make_Coord(n, d, h, w, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (W) + } // for (H) + } // for (D) + } // for (N) +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +/// Wgrad +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// dw = wgrad(dy, x) +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename 
ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3dWgrad( + cutlass::conv::Conv3dProblemSize problem_size, + TensorRef tensor_dy, + TensorRef tensor_x, + TensorRef tensor_dw_in, + TensorRef tensor_dw_out, + ElementCompute alpha, + ElementCompute beta) { + + InnerProductOp inner_product_op; + ConvertOp convert_op; + + // Apply MMA and accumulate ElementAccumulator + for (int k = 0; k < problem_size.K; ++k) { + for (int t = 0; t < problem_size.T; ++t) { + for (int r = 0; r < problem_size.R; ++r) { + for (int s = 0; s < problem_size.S; ++s) { + for (int c = 0; c < problem_size.C; ++c) { + + ElementAccumulator acc = ElementAccumulator(); + + for (int n = 0; n < problem_size.N; ++n) { + for (int z = 0; z < problem_size.Z; ++z) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + + int filter_t = t; + int filter_r = r; + int filter_s = s; + + if (problem_size.mode == cutlass::conv::Mode::kConvolution) { + filter_t = problem_size.T - 1 - t; + filter_r = problem_size.R - 1 - r; + filter_s = problem_size.S - 1 - s; + } + + Tensor5DCoord b_coord = make_Coord( + n, + z * problem_size.stride_d - problem_size.pad_d + filter_t * problem_size.dilation_d, + p * problem_size.stride_h - problem_size.pad_h + filter_r * problem_size.dilation_h, + q * problem_size.stride_w - problem_size.pad_w + filter_s * problem_size.dilation_w, + c); + + if (b_coord.d() < problem_size.D && b_coord.d() >= 0 && + b_coord.h() < problem_size.H && b_coord.h() >= 0 && + b_coord.w() < problem_size.W && b_coord.w() >= 0) { + + ElementAccumulator a = ElementAccumulator(tensor_dy.at(cutlass::make_Coord(n, z, p, q, k))); + ElementAccumulator b = ElementAccumulator(tensor_x.at(b_coord)); + + acc = inner_product_op(a, b, acc); + } + } + } + } + } + + // Apply Epilogue, compute ElementCompute, convert and store ElementC + ElementC c_ref = ElementC(); + + if (beta != ElementCompute()) { + c_ref = tensor_dw_in.at(cutlass::make_Coord(k, t, r, s, c)); + } + + tensor_dw_out.at(cutlass::make_Coord(k, t, r, s, c)) = + convert_op(alpha * ElementCompute(acc) + beta * ElementCompute(c_ref)); + + } // for (C) + } // for (S) + } // for (R) + } // for (T) + } // for (K) +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Generic 3D convolution targeting Conv2dFprop, Conv2dDgrad, and Conv2dWgrad. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementCompute, + typename ElementAccumulator = ElementCompute, + typename ConvertOp = NumericConverter, + typename InnerProductOp = multiply_add +> +void Conv3d( + conv::Operator convolutional_operator, + conv::Conv3dProblemSize problem_size, + TensorRef tensor_A, + TensorRef tensor_B, + TensorRef tensor_C, + TensorRef tensor_D, + ElementCompute alpha, + ElementCompute beta) { + + switch (convolutional_operator) { + case conv::Operator::kFprop: + Conv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kDgrad: + Conv3dDgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + case conv::Operator::kWgrad: + Conv3dWgrad< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementCompute, + ElementAccumulator, + ConvertOp, InnerProductOp + >(problem_size, tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta); + break; + + default: + break; + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace host +} // namespace reference +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/util/include/cutlass/util/reference/host/gemm.h b/tools/util/include/cutlass/util/reference/host/gemm.h index 98db6dcd95..6381aa3066 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm.h +++ b/tools/util/include/cutlass/util/reference/host/gemm.h @@ -249,6 +249,45 @@ struct Gemm +struct Gemm { + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, initial_accum); + } + + void operator()(gemm::GemmCoord problem_size, ScalarType alpha, + TensorRef tensor_a, + TensorRef tensor_b, ScalarType beta, + TensorRef tensor_c, + TensorRef tensor_d, + ComputeType initial_accum = ComputeType(0)) { + static_assert( + LayoutA::kRank == 2 && LayoutB::kRank == 2 && LayoutC::kRank == 2, + "Tensors must be of rank 2"); + + compute_gemm>( + problem_size, alpha, tensor_a, tensor_b, beta, tensor_c, tensor_d, initial_accum); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Partial specialization for multiply-add-saturate template