diff --git a/CHANGELOG.md b/CHANGELOG.md index d90f71378b..bdee14f0fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,19 @@ # NVIDIA CUTLASS Changelog # CUTLASS 2.x + +## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26) + * Tensor reductions + * User-supplied reduction operations across one or more dimensions of tensors with affine layouts + * Optimizations for vectorized memory accesses + * Large tensor support, up to 2^63 elements (however, each dimension is limited to an extent of 2^31) + * Fused inlined operations on Convolution input + * Vector broadcast and transformation on Convolution input + * Optimizations for 3-D convolution + * Tile iterators using precomputed delta table for three spatial dimensions + * Performance parity with 2-D convolution implementation + + ## [2.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.4.0) (2020-11-19) * Implicit GEMM convolution kernels supporting CUDA and Tensor Cores on NVIDIA GPUs * Operators: forward (Fprop), backward data gradient (Dgrad), and backward weight gradient (Wgrad) convolution @@ -126,7 +139,7 @@ ## Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/CMakeLists.txt b/CMakeLists.txt index a0ece82c6d..4abf54a986 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -32,7 +32,7 @@ endif() message(STATUS "CMake Version: ${CMAKE_VERSION}") -project(CUTLASS VERSION 2.4.0 LANGUAGES CXX) +project(CUTLASS VERSION 2.5.0 LANGUAGES CXX) include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake) find_package(Doxygen QUIET) @@ -67,6 +67,8 @@ else() set(CUTLASS_ENABLE_TOOLS_INIT ON) endif() +set(CUTLASS_TEST_UNIT_ENABLE_WARNINGS OFF CACHE BOOL "Enable warnings on waived unit tests.") + set(CUTLASS_ENABLE_EXAMPLES ${CUTLASS_ENABLE_EXAMPLES_INIT} CACHE BOOL "Enable CUTLASS Examples") set(CUTLASS_ENABLE_TOOLS ${CUTLASS_ENABLE_TOOLS_INIT} CACHE BOOL "Enable CUTLASS Tools") set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_TOOLS} CACHE BOOL "Enable CUTLASS Library") @@ -114,10 +116,6 @@ if (POLICY CMP0076) cmake_policy(SET CMP0076 NEW) endif() -if( NOT CMAKE_SIZEOF_VOID_P EQUAL 8 ) - message(FATAL_ERROR "CUTLASS requires a 64-bit compiler!") -endif() - include(GNUInstallDirs) link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs) @@ -257,6 +255,17 @@ if (NOT CMAKE_BUILD_TYPE MATCHES "Release") list(APPEND CUTLASS_CUDA_NVCC_FLAGS -lineinfo) endif() +#Report CUDA build flags +if (CUDA_COMPILER MATCHES "[Cc]lang") + if(CUTLASS_CUDA_CLANG_FLAGS) + message(STATUS "Using CLANG flags: ${CUTLASS_CUDA_CLANG_FLAGS}") + endif() +else() + if(CUTLASS_CUDA_NVCC_FLAGS) + message(STATUS "Using NVCC flags: ${CUTLASS_CUDA_NVCC_FLAGS}") + endif() +endif() + if(CUDA_COMPILER MATCHES "[Cc]lang") if( NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) message(FATAL_ERROR "Clang CUDA compilation requires Clang CXX compilation. 
Currently CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER_ID}" ) @@ -318,20 +327,35 @@ function(cutlass_apply_cuda_gencode_flags TARGET) endfunction() +# Cache the flags so they are available when the function below is called anywhere globally. + +set(__CUTLASS_CUDA_FLAGS ${CUTLASS_CUDA_FLAGS} CACHE INTERNAL "") +set(__CUTLASS_CUDA_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} CACHE INTERNAL "") +set(__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "") +set(__CUTLASS_CUDA_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} CACHE INTERNAL "") +set(__CUTLASS_CUDA_CLANG_FLAGS ${CUTLASS_CUDA_CLANG_FLAGS} CACHE INTERNAL "") +set(__CUTLASS_CUDA_CLANG_FLAGS_RELEASE ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE} CACHE INTERNAL "") +set(__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "") +set(__CUTLASS_CUDA_CLANG_FLAGS_DEBUG ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG} CACHE INTERNAL "") +set(__CUTLASS_CUDA_NVCC_FLAGS ${CUTLASS_CUDA_NVCC_FLAGS} CACHE INTERNAL "") +set(__CUTLASS_CUDA_NVCC_FLAGS_RELEASE ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE} CACHE INTERNAL "") +set(__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO} CACHE INTERNAL "") +set(__CUTLASS_CUDA_NVCC_FLAGS_DEBUG ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG} CACHE INTERNAL "") + function(cutlass_apply_standard_compile_options TARGET) if(CUDA_COMPILER MATCHES "[Cc]lang") set(CUDA_COMPILE_LANGUAGE CXX) - set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_CLANG_FLAGS}) - set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_CLANG_FLAGS_RELEASE}) - set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO}) - set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_CLANG_FLAGS_DEBUG}) + set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_CLANG_FLAGS}) + set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_CLANG_FLAGS_RELEASE}) + set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_CLANG_FLAGS_RELWITHDEBINFO}) + set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_CLANG_FLAGS_DEBUG}) else() set(CUDA_COMPILE_LANGUAGE CUDA) - set(_FLAGS ${CUTLASS_CUDA_FLAGS} ${CUTLASS_CUDA_NVCC_FLAGS}) - set(_FLAGS_RELEASE ${CUTLASS_CUDA_FLAGS_RELEASE} ${CUTLASS_CUDA_NVCC_FLAGS_RELEASE}) - set(_FLAGS_RELWITHDEBINFO ${CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO}) - set(_FLAGS_DEBUG ${CUTLASS_CUDA_FLAGS_DEBUG} ${CUTLASS_CUDA_NVCC_FLAGS_DEBUG}) + set(_FLAGS ${__CUTLASS_CUDA_FLAGS} ${__CUTLASS_CUDA_NVCC_FLAGS}) + set(_FLAGS_RELEASE ${__CUTLASS_CUDA_FLAGS_RELEASE} ${__CUTLASS_CUDA_NVCC_FLAGS_RELEASE}) + set(_FLAGS_RELWITHDEBINFO ${__CUTLASS_CUDA_FLAGS_RELWITHDEBINFO} ${__CUTLASS_CUDA_NVCC_FLAGS_RELWITHDEBINFO}) + set(_FLAGS_DEBUG ${__CUTLASS_CUDA_FLAGS_DEBUG} ${__CUTLASS_CUDA_NVCC_FLAGS_DEBUG}) endif() target_compile_options( @@ -464,20 +488,6 @@ endif() ################################################################################ -include(${CMAKE_CURRENT_SOURCE_DIR}/cuBLAS.cmake) - -if (CUTLASS_ENABLE_CUBLAS) - target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1) -endif() - -include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake) - -if (CUTLASS_ENABLE_CUDNN) - target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1) -endif() - -################################################################################ - include(CTest) enable_testing() if (NOT TARGET test_all) @@ -497,6 +507,22 @@ install(DIRECTORY DESTINATION 
${CUTLASS_TEST_INSTALL_BINDIR}) install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_LIBDIR}) install(DIRECTORY DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest) +################################################################################ + +include(${CMAKE_CURRENT_SOURCE_DIR}/cuBLAS.cmake) + +if (CUTLASS_ENABLE_CUBLAS) + target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUBLAS=1) +endif() + +include(${CMAKE_CURRENT_SOURCE_DIR}/cuDNN.cmake) + +if (CUTLASS_ENABLE_CUDNN) + target_compile_definitions(CUTLASS INTERFACE CUTLASS_ENABLE_CUDNN=1) +endif() + +################################################################################ + set(CUTLASS_CTEST_TEMPLATE_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/CTestTestfile.config.cmake) set(CUTLASS_CTEST_GENERATED_FILES "" CACHE INTERNAL "") diff --git a/CUDA.cmake b/CUDA.cmake index c887178a89..3578989a23 100644 --- a/CUDA.cmake +++ b/CUDA.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -204,7 +204,7 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) # paths by default, so we add it explicitly here. function(cutlass_correct_source_file_language_property) - if(CUDA_COMPILER MATCHES "clang") + if(CUDA_COMPILER MATCHES "[Cc]lang") foreach(File ${ARGN}) if(File MATCHES ".*\.cu$") set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) diff --git a/README.md b/README.md index d8855c7395..d376d636ef 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 2.4 +# CUTLASS 2.5 -_CUTLASS 2.4 - November 2020_ +_CUTLASS 2.5 - February 2021_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. @@ -34,12 +34,18 @@ See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly. See the [functionality listing](/media/docs/functionality.md) for the list of operations supported at each level of the execution model hierarchy. +# What's New in CUTLASS 2.5 +CUTLASS 2.5 is a minor update to CUTLASS adding: +- Tensor reductions +- Fused inlined operations on Convolution input +- Optimizations for 3-D convolution +- See the [CHANGELOG](CHANGELOG.md) for more details + # What's New in CUTLASS 2.4 CUTLASS 2.4 is a significant update to CUTLASS adding: - 1-D, 2-D, and 3-D convolution targeting Tensor and CUDA cores for NVIDIA Ampere, Turing, and Volta GPU architectures - CUTLASS profiler support for convolution - [Documentation](/media/docs/implicit_gemm_convolution.md) describing Implicit GEMM Convolution algorithm and implementation -- See the [CHANGELOG](CHANGELOG.md) for more details. # What's New in CUTLASS 2.3 @@ -47,7 +53,6 @@ CUTLASS 2.3 is a minor update to CUTLASS adding: - GEMMs targeting structured [Sparse Tensor Cores](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu) in NVIDIA Ampere Architecture GPUs - Fast SGEMM kernels targeting GeForce RTX 30-series CUDA Cores - Intended to be compiled with [CUDA 11.1 Toolkit](https://developer.nvidia.com/cuda-toolkit) -- See the [CHANGELOG](CHANGELOG.md) for more details. 
# What's New in CUTLASS 2.2 @@ -508,7 +513,7 @@ The official list of CUTLASS developers and contributors is available here: [CON # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/cmake/nop.cu b/cmake/nop.cu index 518a582b89..77216e5c7b 100644 --- a/cmake/nop.cu +++ b/cmake/nop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/cuBLAS.cmake b/cuBLAS.cmake index 0ad6db2378..0e1733f0ac 100644 --- a/cuBLAS.cmake +++ b/cuBLAS.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/cuDNN.cmake b/cuDNN.cmake index da5e453131..0eb8e853ab 100644 --- a/cuDNN.cmake +++ b/cuDNN.cmake @@ -1,5 +1,5 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/00_basic_gemm/CMakeLists.txt b/examples/00_basic_gemm/CMakeLists.txt index 9ae257d9ab..8a619b3258 100644 --- a/examples/00_basic_gemm/CMakeLists.txt +++ b/examples/00_basic_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index bda012abee..1dbeef75d5 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/01_cutlass_utilities/CMakeLists.txt b/examples/01_cutlass_utilities/CMakeLists.txt index 5f22b7b1cf..9a1d59325c 100644 --- a/examples/01_cutlass_utilities/CMakeLists.txt +++ b/examples/01_cutlass_utilities/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/01_cutlass_utilities/cutlass_utilities.cu b/examples/01_cutlass_utilities/cutlass_utilities.cu index d1eaa57fe7..8d6bf6a61a 100644 --- a/examples/01_cutlass_utilities/cutlass_utilities.cu +++ b/examples/01_cutlass_utilities/cutlass_utilities.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/02_dump_reg_shmem/CMakeLists.txt b/examples/02_dump_reg_shmem/CMakeLists.txt index 5e6112e026..15216513aa 100644 --- a/examples/02_dump_reg_shmem/CMakeLists.txt +++ b/examples/02_dump_reg_shmem/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/02_dump_reg_shmem/dump_reg_shmem.cu b/examples/02_dump_reg_shmem/dump_reg_shmem.cu index 9d7db79a95..c4276da103 100644 --- a/examples/02_dump_reg_shmem/dump_reg_shmem.cu +++ b/examples/02_dump_reg_shmem/dump_reg_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt index 27a87c9292..60700f5fcb 100644 --- a/examples/03_visualize_layout/CMakeLists.txt +++ b/examples/03_visualize_layout/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/03_visualize_layout/options.h b/examples/03_visualize_layout/options.h index dd7de198a4..4fba7a77bd 100644 --- a/examples/03_visualize_layout/options.h +++ b/examples/03_visualize_layout/options.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/register_layout.cu b/examples/03_visualize_layout/register_layout.cu index 0d2b25eb30..1a761ecb3b 100644 --- a/examples/03_visualize_layout/register_layout.cu +++ b/examples/03_visualize_layout/register_layout.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/register_layout.h b/examples/03_visualize_layout/register_layout.h index 1518e433c8..7eb1c778e5 100644 --- a/examples/03_visualize_layout/register_layout.h +++ b/examples/03_visualize_layout/register_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp index 3c4b783ca6..b8098d5038 100644 --- a/examples/03_visualize_layout/visualize_layout.cpp +++ b/examples/03_visualize_layout/visualize_layout.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/visualize_layout.h b/examples/03_visualize_layout/visualize_layout.h index 4093d27721..5d96acc4c4 100644 --- a/examples/03_visualize_layout/visualize_layout.h +++ b/examples/03_visualize_layout/visualize_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/04_tile_iterator/CMakeLists.txt b/examples/04_tile_iterator/CMakeLists.txt index cd32e2287a..7d22d9cf67 100644 --- a/examples/04_tile_iterator/CMakeLists.txt +++ b/examples/04_tile_iterator/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu index 5c56f33bd8..47aaad8f58 100644 --- a/examples/04_tile_iterator/tile_iterator.cu +++ b/examples/04_tile_iterator/tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/05_batched_gemm/CMakeLists.txt b/examples/05_batched_gemm/CMakeLists.txt index 6cd0ca8dba..f62f20955e 100644 --- a/examples/05_batched_gemm/CMakeLists.txt +++ b/examples/05_batched_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/05_batched_gemm/batched_gemm.cu b/examples/05_batched_gemm/batched_gemm.cu index a9d8a9c680..10204837bb 100644 --- a/examples/05_batched_gemm/batched_gemm.cu +++ b/examples/05_batched_gemm/batched_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/06_splitK_gemm/CMakeLists.txt b/examples/06_splitK_gemm/CMakeLists.txt index 7b30ae1668..e47f8df876 100644 --- a/examples/06_splitK_gemm/CMakeLists.txt +++ b/examples/06_splitK_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu index b38de0c885..8aec0a294b 100644 --- a/examples/06_splitK_gemm/splitk_gemm.cu +++ b/examples/06_splitK_gemm/splitk_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/07_volta_tensorop_gemm/CMakeLists.txt b/examples/07_volta_tensorop_gemm/CMakeLists.txt index 82e8172271..61d5a82597 100644 --- a/examples/07_volta_tensorop_gemm/CMakeLists.txt +++ b/examples/07_volta_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu index ac27fa177d..23d5a95e2e 100644 --- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu +++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -284,8 +284,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt index b4e4fe82f6..b5b16ba1de 100644 --- a/examples/08_turing_tensorop_gemm/CMakeLists.txt +++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index 36f794d921..ba739bea01 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -266,8 +266,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel diff --git a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt index b1b5c8df1e..d529f978ea 100644 --- a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt +++ b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu index cf07efdcb5..efbca39d63 100644 --- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu +++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -485,6 +485,7 @@ Result profile_convolution(Options const &options) { // Split K dimension into 1 partitions int split_k_slices = 1; + // Construct Conv2dProblemSize with user defined output size cutlass::conv::Conv2dProblemSize problem_size( options.input_size, options.filter_size, @@ -495,6 +496,8 @@ Result profile_convolution(Options const &options) { mode, split_k_slices); + // Construct ImplicitGemm::Argument structure with conv2d + // problem size, data pointers, and epilogue values typename ImplicitGemm::Arguments arguments{ problem_size, tensor_a.device_ref(), @@ -515,6 +518,9 @@ Result profile_convolution(Options const &options) { // Allocate workspace memory cutlass::device_memory::allocation workspace(workspace_size); + result.status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(result.status); + result.status = implicit_gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(result.status); diff --git a/examples/10_planar_complex/CMakeLists.txt b/examples/10_planar_complex/CMakeLists.txt index 555836aebf..31e5c31a1d 100644 --- a/examples/10_planar_complex/CMakeLists.txt +++ b/examples/10_planar_complex/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu index d810777d9c..1ee8a069ee 100644 --- a/examples/10_planar_complex/planar_complex.cu +++ b/examples/10_planar_complex/planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/11_planar_complex_array/CMakeLists.txt b/examples/11_planar_complex_array/CMakeLists.txt index 2a3f5987e4..082629b87a 100644 --- a/examples/11_planar_complex_array/CMakeLists.txt +++ b/examples/11_planar_complex_array/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu index 53134168a0..e74ba10a18 100644 --- a/examples/11_planar_complex_array/planar_complex_array.cu +++ b/examples/11_planar_complex_array/planar_complex_array.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt index fb78d77fa2..1a02f5e6d2 100644 --- a/examples/12_gemm_bias_relu/CMakeLists.txt +++ b/examples/12_gemm_bias_relu/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu index 2b5c779bc6..5ad0d4a0ca 100644 --- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu +++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -48,11 +48,19 @@ using ElementInputA = cutlass::half_t; // <- data type of elements using ElementInputB = cutlass::half_t; // <- data type of elements in input matrix B using ElementOutput = float; // <- data type of elements in output matrix D -// The code section below describes matrix layout of input and output matrices. Column Major for -// Matrix A, Row Major for Matrix B and Row Major for Matrix C +// The code section below describes matrix layout of input and output matrices. +// Column Major for Matrix A, B and C. +// +// Note this example only works for ColumnMajor output because +// 1) we only have row major epilogue. +// 2) we swap A and B if the output is column major then we can still use the +// row major epilogue. +// 3) Mx1 bias vector becomes 1xM after the swapping/transposing. +// 4) we can use the existing OutputIterator to load 1xM bias vector. + using LayoutInputA = cutlass::layout::ColumnMajor; using LayoutInputB = cutlass::layout::ColumnMajor; -using LayoutOutput = cutlass::layout::RowMajor; +using LayoutOutput = cutlass::layout::ColumnMajor; // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM using MMAOp = cutlass::arch::OpClassTensorOp; @@ -73,17 +81,18 @@ using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSw // Define the epilogue operation as LinearCombinationRelu. 
This is approximately equal to // -// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + beta * c_ij ) +// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + c_ij ) // using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu< - ElementOutput, // <- data type of output matrix - 128 / cutlass::sizeof_bits::value, // <- this is the number of elements per - // vectorized memory access. For half - // precision, it's 8 elements. This becomes - // the vector width of math instructions in - // epilogue too - ElementAccumulator, // <- data type of accumulator - ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- this is the number of elements per + // vectorized memory access. For half + // precision, it's 8 elements. This becomes + // the vector width of math instructions in + // epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue, // <- data type for alpha in linear combination function + cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // <- alpha x C + bias // Number of pipelines you want to use constexpr int NumStages = 2; @@ -160,9 +169,8 @@ int run() { tensor_d.sync_device(); tensor_ref_d.sync_device(); - // Initialize alpha and beta for dot product computation + // Initialize alpha for dot product computation ElementComputeEpilogue alpha = ElementComputeEpilogue(1); - ElementComputeEpilogue beta = ElementComputeEpilogue(0); // Split K dimension into 1 partitions int split_k_slices = 1; @@ -178,7 +186,7 @@ int run() { // to project away the N dimension by setting the stride to zero. tensor_d.device_ref(), // <- reference to matrix D on device - {alpha, beta}, // <- tuple of alpha and beta + {alpha}, // <- alpha split_k_slices}; // <- k-dimension split factor // Using the arguments, query for extra workspace required for matrix multiplication computation @@ -190,8 +198,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel @@ -233,7 +245,7 @@ int run() { for (int j = 0; j < problem_size.n(); ++j) { tensor_ref_d.at({i, j}) = std::max( ElementOutput(0), - ElementOutput(tensor_ref_d.at({i, j}) + beta * tensor_c_bias.at({i, 0})) + ElementOutput(tensor_ref_d.at({i, j}) + tensor_c_bias.at({i, 0})) ); } } diff --git a/examples/13_two_tensor_op_fusion/CMakeLists.txt b/examples/13_two_tensor_op_fusion/CMakeLists.txt new file mode 100644 index 0000000000..220485b7b7 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/CMakeLists.txt @@ -0,0 +1,45 @@ +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. 
+# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 13_fused_two_gemms + fused_gemm.cu + ) + +cutlass_example_add_executable( + 13_fused_two_convs + fused_conv2d.cu + ) + + +target_include_directories( + 13_fused_two_gemms + PRIVATE + . + ) + +target_include_directories( + 13_fused_two_convs + PRIVATE + . + ) + diff --git a/examples/13_two_tensor_op_fusion/README.md b/examples/13_two_tensor_op_fusion/README.md new file mode 100644 index 0000000000..d89d876a0c --- /dev/null +++ b/examples/13_two_tensor_op_fusion/README.md @@ -0,0 +1,76 @@ +# Introduction + +This example shows fusing two back-to-back GEMMs/Convolutions into one kernel. + +
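+As a purely illustrative sketch (not part of this example's sources), the host-side
+reference below spells out what the two back-to-back operations compute in the GEMM case.
+The function and variable names here are ours; the general GEMM/Conv form is given by the
+equations that follow.
+
+```cpp
+#include <algorithm>
+#include <vector>
+
+using Matrix = std::vector<std::vector<float>>;
+
+// D0 = relu(alpha0 * A0 * B0); D1 = relu(alpha1 * D0 * B1 + beta1 * C1).
+// In the fused kernel, D0 never leaves the register file.
+Matrix reference_b2b_gemm(Matrix const &A0, Matrix const &B0, Matrix const &B1,
+                          Matrix const &C1, float alpha0, float alpha1, float beta1) {
+  int M = int(A0.size()), K0 = int(B0.size()), N0 = int(B1.size()), N1 = int(B1[0].size());
+  Matrix D0(M, std::vector<float>(N0, 0.f)), D1(M, std::vector<float>(N1, 0.f));
+  for (int i = 0; i < M; ++i)        // 1st GEMM + ReLU
+    for (int j = 0; j < N0; ++j) {
+      float acc = 0.f;
+      for (int k = 0; k < K0; ++k) acc += A0[i][k] * B0[k][j];
+      D0[i][j] = std::max(0.f, alpha0 * acc);
+    }
+  for (int i = 0; i < M; ++i)        // 2nd GEMM + residual + ReLU
+    for (int j = 0; j < N1; ++j) {
+      float acc = 0.f;
+      for (int k = 0; k < N0; ++k) acc += D0[i][k] * B1[k][j];
+      D1[i][j] = std::max(0.f, alpha1 * acc + beta1 * C1[i][j]);
+    }
+  return D1;
+}
+```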

+ +When running two unfused GEMM/Conv operations, each operation loads one input +activation matrix and one weight matrix (or filter matrix) from memory and then +stores the resulting activation matrix back to memory. + +When the two GEMM/Conv operations are fused together, the mainloops of the two +GEMMs/Convs run back to back in a single kernel. The output accumulator of the +1st GEMM/Conv will be stored in the register file and reused as the activation +input of the 2nd GEMM/Conv. This saves a round trip to memory for the activation +matrix. + + +This example computes the following: +- 1st GEMM/Conv: D0 = relu(alpha0 .\* A0 \*\* B0) +- 2nd GEMM/Conv: D1 = relu(alpha1 .\* D0 \*\* B1 + beta1 .\* C1) + +In the above equations, the operator \*\* can be either a matrix multiplication or a convolution. + +# Implementation Details + +In order to run two GEMM/Convs in a single kernel, the example requires that the same number of +threadblocks be used across the two GEMMs/Convs. This also ensures the same threadblock tile M across +the two GEMMs/Convs. + +In order to reuse the output accumulator (stored in the register file) of the 1st GEMM as the +input activation, the example enforces the following two constraints: + +- thread_block_tile_N = problem_N +

+ +This constraint ensures that each threadblock loads the entire weight/filter matrix in +addition to its own input activation tile. Therefore the input activation tile of the +2nd GEMM/Conv only depends on the output activation tile of the 1st GEMM/Conv, and the +operation can be fully block-resident. + +- warp_tile_N = thread_block_tile_N + +

+ +This constraint ensures that each warp loads the entire weight/filter kBlock in +addition to its own input activation tile. Therefore the input activation warp tile of the +2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the +register file, and the operation can be fully register-file-resident. + +# Copyright + +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + +``` + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` + diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h new file mode 100644 index 0000000000..305d18297c --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h @@ -0,0 +1,368 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "device/b2b_implicit_gemm_convolution.h" +#include "b2b_conv2d_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); +cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); + +void run_nonfused_conv2d_fprop_f16_sm75() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + 
cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back FP16 Analytic Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_f16_sm75() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back FP16 Analytic Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_nonfused_conv2d_fprop_optimized_f16_sm75() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 
32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_optimized_f16_sm75() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 32>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAdd, + 
cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_f16_sm75_problem_size_0, conv2d_f16_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h new file mode 100644 index 0000000000..e14134e944 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h @@ -0,0 +1,363 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "device/b2b_implicit_gemm_convolution.h" +#include "b2b_conv2d_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); +cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); + +void run_nonfused_conv2d_fprop_f16_sm80() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back FP16 Analytic Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, 
beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_f16_sm80() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back FP16 Analytic Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_nonfused_conv2d_fprop_optimized_f16_sm80() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + 
cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back FP16 Optimized Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_optimized_f16_sm80() { + + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNHWC, + ElementB, cutlass::layout::TensorNHWC, + ElementC, cutlass::layout::TensorNHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back FP16 Optimized Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_f16_sm80_problem_size_0, conv2d_f16_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git 
a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h new file mode 100644 index 0000000000..2cb4ac2e80 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h @@ -0,0 +1,367 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "device/b2b_implicit_gemm_convolution.h" +#include "b2b_interleaved_conv2d_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); +cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); + +void run_nonfused_conv2d_fprop_s8_sm75() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bInterleavedNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, 
cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_s8_sm75() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bInterleavedFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_nonfused_conv2d_fprop_optimized_s8_sm75() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + 
cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bInterleavedNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_optimized_s8_sm75() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bInterleavedFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_s8_sm75_problem_size_0, conv2d_s8_sm75_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if 
defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h new file mode 100644 index 0000000000..c73d6c69b4 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h @@ -0,0 +1,368 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "device/b2b_implicit_gemm_convolution.h" +#include "b2b_interleaved_conv2d_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 3, 3, 64}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); +cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 ( + {128, 56, 56, 64}, // input size (NHWC) + {64, 1, 1, 64}, // filter size (KRSC) + {0, 0, 0, 0}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1}, // dilation (dilation_h, dilation_w) + {128, 56, 56, 64} // output size (NPQK) + ); + +void run_nonfused_conv2d_fprop_s8_sm80() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bInterleavedNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, 
cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_s8_sm80() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 8 * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kAnalytic + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bInterleavedFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back INT8 interleaved Analytic Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_nonfused_conv2d_fprop_optimized_s8_sm80() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + + using Conv2dFpropKernel0 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + 
cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop0 = cutlass::conv::device::ImplicitGemmConvolution; + + using Conv2dFpropKernel1 = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv2dFprop1 = cutlass::conv::device::ImplicitGemmConvolution; + + B2bInterleavedNonFusedConv2dRun nonFusedConv2d; + + std::cout << "Running Non-fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n"; + bool pass = nonFusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + +void run_fused_conv2d_fprop_optimized_s8_sm80() { + + using ElementA = int8_t; + using ElementB = int8_t; + using ElementC = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + ElementCompute alpha0 = ElementCompute(1); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(1); + ElementCompute beta1 = ElementCompute(0); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 8 * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementC, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bConv2dFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop< + ElementA, cutlass::layout::TensorNCxHWx<32>, + ElementB, cutlass::layout::TensorCxRSKx<32>, + ElementC, cutlass::layout::TensorNCxHWx<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using B2bConv2dFprop = cutlass::conv::device::B2bImplicitGemmConvolution; + + B2bInterleavedFusedConv2dRun fusedConv2d; + + std::cout << "Running Fused back-to-back INT8 interleaved Optimized Convolution Fprops...\n"; + bool pass = fusedConv2d.run(conv2d_s8_sm80_problem_size_0, conv2d_s8_sm80_problem_size_1, cutlass::conv::SplitKMode::kSerial, + alpha0, beta0, alpha1, beta1); + + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} + + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if 
defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h new file mode 100644 index 0000000000..07e3a0dfc2 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h @@ -0,0 +1,628 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/reference/device/tensor_relu.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + + +template +class B2bNonFusedConv2dRun { +public: + + using Conv2d0 = Conv2d0_; + using Conv2d1 = Conv2d1_; + using ElementAccumulator = typename Conv2d0::ElementAccumulator; + using ElementCompute = typename Conv2d0::ElementCompute; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator; + static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator, + "Fused convolution operators must be the same"); + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A0; + cutlass::HostTensor tensor_B0; + cutlass::HostTensor tensor_C0; + cutlass::HostTensor tensor_D0_computed; + cutlass::HostTensor tensor_D0_reference; + + cutlass::HostTensor tensor_B1; + cutlass::HostTensor tensor_C1; + cutlass::HostTensor tensor_D1_computed; + cutlass::HostTensor tensor_D1_reference; + + +public: + + B2bNonFusedConv2dRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 16) { + scope = 2; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) { + + tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0)); + 
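// The implicit_gemm_tensor_{a,b,c}_extent() helpers size each operand from the
+    // convolution problem: for Fprop, A maps to the (N, H, W, C) activations, B to
+    // the (K, R, S, C) filters, and C/D to the (N, P, Q, K) output. No separate A1
+    // tensor is allocated because the first convolution's output D0 is fed directly
+    // in as the activation input of the second convolution.
+    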
tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + + initialize_tensor(tensor_A0.host_view(), init_A, seed); + initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C0.host_view(), init_C, seed * 39); + initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); + initialize_tensor(tensor_C1.host_view(), init_C, seed * 40); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_D0_computed.sync_device(); + tensor_D0_reference.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_D1_computed.sync_device(); + tensor_D1_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true, + int warm_ups = 1, + int runs = 100) { + + initialize(problem_size_0, problem_size_1); + + // configure the operator + Conv2d0 conv2d_op_0; + Conv2d1 conv2d_op_1; + + typename Conv2d0::Arguments conv2d_args_0( + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_computed.device_ref(), + {alpha0, beta0}, + split_k_mode + ); + typename Conv2d1::Arguments conv2d_args_1( + problem_size_1, + tensor_D0_computed.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_computed.device_ref(), + {alpha1, beta1}, + split_k_mode + ); + + + cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0); + + CUTLASS_CHECK(status); + + status = conv2d_op_1.initialize(conv2d_args_1); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = conv2d_op_0(); + CUTLASS_CHECK(status); + status = conv2d_op_1(); + CUTLASS_CHECK(status); + } + + // + // Run Conv2d + // + cudaEvent_t start, stop1, stop2; + cudaEventCreate(&start); + cudaEventCreate(&stop1); + cudaEventCreate(&stop2); + + cudaEventRecord(start); + + + for(int i = 0; i < runs; i++) { + // run conv2d operator + status = conv2d_op_0(); + CUTLASS_CHECK(status); + } + cudaEventRecord(stop1); + + for(int i = 0; i < runs; i++) { + // run conv2d operator + status = conv2d_op_1(); + CUTLASS_CHECK(status); + } + cudaEventRecord(stop2); + cudaDeviceSynchronize(); + float conv2d0Time, conv2d1Time, totalTime; + cudaEventElapsedTime(&conv2d0Time, start, stop1); + cudaEventElapsedTime(&conv2d1Time, stop1, stop2); + cudaEventElapsedTime(&totalTime, start, stop2); + std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n"; + std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " 
ms\n"; + std::cout << "total time " << totalTime / (float)runs << " ms\n"; + + tensor_D0_computed.sync_host(); + tensor_D1_computed.sync_host(); + + bool passed = false; + + cutlass::reference::device::Conv2d< + typename Conv2d0::ElementA, + typename Conv2d0::LayoutA, + typename Conv2d0::ElementB, + typename Conv2d0::LayoutB, + typename Conv2d0::ElementC, + typename Conv2d0::LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_reference.device_ref(), + alpha0, + beta0); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); + } + + cutlass::reference::device::Conv2d< + typename Conv2d1::ElementA, + typename Conv2d1::LayoutA, + typename Conv2d1::ElementB, + typename Conv2d1::LayoutB, + typename Conv2d1::ElementC, + typename Conv2d1::LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size_1, + tensor_D0_reference.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_reference.device_ref(), + alpha1, + beta1); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); + } + + cudaError_t result = cudaDeviceSynchronize(); + CHECK_TRUE(result == cudaSuccess); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D0_reference.sync_host(); + tensor_D1_reference.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0); + + passed = cutlass::reference::host::TensorEquals( + tensor_D1_computed.host_view(), + tensor_D1_reference.host_view()); + + CHECK_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_B2bImplicitGemm_device_nonfused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream results(fname.str()); + + results << problem_size_0 << std::endl; + results << problem_size_1 << std::endl; + + results + << "\nA0:\n" << tensor_A0.host_view() << "\n" + << "\nB0:\n" << tensor_B0.host_view() << "\n" + << "\nC0:\n" << tensor_C0.host_view() << "\n" + << "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n" + << "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n" + << "\nB1:\n" << tensor_B1.host_view() << "\n" + << "\nC1:\n" << tensor_C1.host_view() << "\n" + << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n" + << "\nD1 computed:\n" << tensor_D1_computed.host_view(); + + + } + + return passed; + } + +}; + +template +class B2bFusedConv2dRun { +public: + + using B2bConv2d = B2bConv2d_; + using ElementAccumulator = typename B2bConv2d::ElementAccumulator; + using ElementCompute = typename B2bConv2d::ElementCompute; + + static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator; + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A0; + cutlass::HostTensor tensor_B0; + cutlass::HostTensor tensor_C0; + cutlass::HostTensor tensor_D0_reference; + + cutlass::HostTensor tensor_B1; + cutlass::HostTensor tensor_C1; + cutlass::HostTensor 
tensor_D1_computed; + cutlass::HostTensor tensor_D1_reference; + + +public: + + B2bFusedConv2dRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 16) { + scope = 2; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) { + + tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0)); + tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + + initialize_tensor(tensor_A0.host_view(), init_A, seed); + initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C0.host_view(), init_C, seed * 39); + initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); + initialize_tensor(tensor_C1.host_view(), init_C, seed * 40); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_D0_reference.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_D1_computed.sync_device(); + tensor_D1_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true, + int warm_ups = 1, + int runs = 100) { + + initialize(problem_size_0, problem_size_1); + + // configure the operator + B2bConv2d b2b_conv2d_op; + + typename B2bConv2d::Arguments b2b_conv2d_args( + problem_size_0, + problem_size_1, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_B1.device_ref(), + 
tensor_C1.device_ref(), + tensor_D1_computed.device_ref(), + {alpha0, beta0}, + {alpha1, beta1}, + split_k_mode + ); + + cutlass::Status status = b2b_conv2d_op.initialize(b2b_conv2d_args); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = b2b_conv2d_op(); + CUTLASS_CHECK(status); + } + + // + // Run the Conv2d + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < runs; i++) { + + // run conv2d operator + status = b2b_conv2d_op(); + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float conv2dTime; + cudaEventElapsedTime(&conv2dTime, start, stop); + std::cout << "time " << conv2dTime / (float)runs << " ms\n"; + + tensor_D1_computed.sync_host(); + + bool passed = false; + + cutlass::reference::device::Conv2d< + typename B2bConv2d::ElementA, + typename B2bConv2d::LayoutA, + typename B2bConv2d::ElementB, + typename B2bConv2d::LayoutB, + typename B2bConv2d::ElementC, + typename B2bConv2d::LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_reference.device_ref(), + alpha0, + beta0); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); + } + + cutlass::reference::device::Conv2d< + typename B2bConv2d::ElementA, + typename B2bConv2d::LayoutA, + typename B2bConv2d::ElementB, + typename B2bConv2d::LayoutB, + typename B2bConv2d::ElementC, + typename B2bConv2d::LayoutC, + ElementCompute, + ElementAccumulator + >( + kConvolutionalOperator, + problem_size_1, + tensor_D0_reference.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_reference.device_ref(), + alpha1, + beta1); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); + } + + cudaError_t result = cudaDeviceSynchronize(); + CHECK_TRUE(result == cudaSuccess); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D0_reference.sync_host(); + tensor_D1_reference.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0); + + passed = cutlass::reference::host::TensorEquals( + tensor_D1_computed.host_view(), + tensor_D1_reference.host_view()); + + CHECK_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_B2bImplicitGemm_device_fused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream results(fname.str()); + + results << problem_size_0 << std::endl; + results << problem_size_1 << std::endl; + + results + << "\nA0:\n" << tensor_A0.host_view() << "\n" + << "\nB0:\n" << tensor_B0.host_view() << "\n" + << "\nC0:\n" << tensor_C0.host_view() << "\n" + << "\nB1:\n" << tensor_B1.host_view() << "\n" + << "\nC1:\n" << tensor_C1.host_view() << "\n" + << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n" + << "\nD1 computed:\n" << tensor_D1_computed.host_view(); + + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h 
similarity index 92% rename from examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h rename to examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h index 10a0d4bf94..50da709e73 100644 --- a/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -43,14 +43,15 @@ //////////////////////////////////////////////////////////////////////////////// +cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*1600, 64, 576); +cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*1600, 128, 64); + void run_nonfused_gemm_f16() { using ElementOutput = cutlass::half_t; using ElementAccumulator = cutlass::half_t; using ElementCompute = cutlass::half_t; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -110,7 +111,7 @@ void run_nonfused_gemm_f16() { B2bNonFusedGemmRun nonFusedGemm; std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n"; - bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool pass = nonFusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1); if(pass) std::cout << "Pass\n"; else @@ -123,8 +124,6 @@ void run_fused_gemm_f16() { using ElementAccumulator = cutlass::half_t; using ElementCompute = cutlass::half_t; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -178,7 +177,7 @@ void run_fused_gemm_f16() { B2bFusedGemmRun fusedGemm; std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n"; - bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool passed = fusedGemm.run(gemm_f16_sm75_problem_size_0, gemm_f16_sm75_problem_size_1, alpha0, beta0, alpha1, beta1); if(passed) std::cout << "Pass\n"; else diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h new file mode 100644 index 0000000000..749ece2b22 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h @@ -0,0 +1,189 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "device/b2b_gemm.h" +#include "b2b_gemm_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*1600, 64, 576); +cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*1600, 128, 64); + +void run_nonfused_gemm_f16_sm80() { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using Gemm0 = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3 + >; + using Gemm1 = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape1, + 
WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3 + >; + + B2bNonFusedGemmRun nonFusedGemm; + + std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n"; + bool pass = nonFusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1); + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; +} + +void run_fused_gemm_f16_sm80() { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 128, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 128, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bGemm = cutlass::gemm::device::B2bGemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 3 + >; + + B2bFusedGemmRun fusedGemm; + + std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n"; + bool passed = fusedGemm.run(gemm_f16_sm80_problem_size_0, gemm_f16_sm80_problem_size_1, alpha0, beta0, alpha1, beta1); + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} +//////////////////////////////////////////////////////////////////////////////// + +#endif //#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/examples/13_fused_two_gemms/b2b_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h similarity index 95% rename from examples/13_fused_two_gemms/b2b_gemm_run.h rename to examples/13_two_tensor_op_fusion/b2b_gemm_run.h index 053064d751..8143f3d21a 100644 --- a/examples/13_fused_two_gemms/b2b_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -121,7 +121,9 @@ struct B2bNonFusedGemmRun ElementCompute beta0 = ElementCompute(0), ElementCompute alpha1 = ElementCompute(1), ElementCompute beta1 = ElementCompute(0), - bool relu = true) { + bool relu = true, + int warm_ups = 1, + int runs = 100) { // // Allocate the GEMM workspace @@ -222,6 +224,14 @@ struct B2bNonFusedGemmRun status = gemm_op_1.initialize(arguments_1); CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = gemm_op_0(); + CUTLASS_CHECK(status); + status = gemm_op_1(); + CUTLASS_CHECK(status); + } + // // Run the GEMM // @@ -233,13 +243,13 @@ struct B2bNonFusedGemmRun cudaEventRecord(start); - for(int i = 0; i < 100; i++) { + for(int i = 0; i < runs; i++) { status = gemm_op_0(); CUTLASS_CHECK(status); } cudaEventRecord(stop1); - for(int i = 0; i < 100; i++) { + for(int i = 0; i < runs; i++) { status = gemm_op_1(); @@ -252,9 +262,9 @@ struct B2bNonFusedGemmRun cudaEventElapsedTime(&gemm0Time, start, stop1); cudaEventElapsedTime(&gemm1Time, stop1, stop2); cudaEventElapsedTime(&totalTime, start, stop2); - std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n"; - std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n"; - std::cout << "total time " << totalTime / 100.0 << " ms\n"; + std::cout << "gemm 0 time " << gemm0Time / (float)runs << " ms\n"; + std::cout << "gemm 1 time " << gemm1Time / (float)runs << " ms\n"; + std::cout << "total time " << totalTime / (float)runs << " ms\n"; tensor_D0.sync_host(); tensor_D1.sync_host(); @@ -415,7 +425,9 @@ struct B2bFusedGemmRun ElementCompute beta0 = ElementCompute(0), ElementCompute alpha1 = ElementCompute(1), ElementCompute beta1 = ElementCompute(0), - bool relu = true) { + bool relu = true, + int warm_ups = 1, + int runs = 100) { // // Allocate the GEMM workspace @@ -433,10 +445,6 @@ struct B2bFusedGemmRun typename B2bGemm::ElementC, typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); -// cutlass::HostTensor< -// typename B2bGemm::ElementC, -// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn()); - cutlass::HostTensor< typename B2bGemm::ElementC, typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); @@ -503,6 +511,11 @@ struct B2bFusedGemmRun CUTLASS_CHECK(status); + for(int i = 0; i < warm_ups; i++) { + status = b2b_gemm_op(); + CUTLASS_CHECK(status); + } + // // Run the GEMM // @@ -513,7 +526,7 @@ struct B2bFusedGemmRun cudaEventRecord(start); - for(int i = 0; i < 100; i++) { + for(int i = 0; i < runs; i++) { status = b2b_gemm_op(); CUTLASS_CHECK(status); @@ -523,9 +536,8 @@ struct B2bFusedGemmRun cudaDeviceSynchronize(); float gemmTime; cudaEventElapsedTime(&gemmTime, start, stop); - std::cout << "time " << gemmTime / 100.0 << " ms\n"; + std::cout << "time " << gemmTime / (float)runs << " ms\n"; - //tensor_D0.sync_host(); tensor_D1.sync_host(); // @@ -593,7 +605,6 @@ struct B2bFusedGemmRun << "A0 =\n" << tensor_A0.host_view() << "\nB0 =\n" << tensor_B0.host_view() << "\nC0 =\n" << tensor_C0.host_view() -// << "\nD0 =\n" << tensor_D0.host_view() << "\nB1 =\n" << tensor_B1.host_view() << "\nC1 =\n" << tensor_C1.host_view() << "\n\nReference =\n" << reference_D1.host_view() diff --git a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h similarity index 92% rename from examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h 
rename to examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h index 1c3f15c2cf..2c2610b7d4 100644 --- a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -43,14 +43,15 @@ //////////////////////////////////////////////////////////////////////////////// +cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*1600, 64, 576); +cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*1600, 128, 64); + void run_nonfused_gemm_s8() { using ElementOutput = int8_t; using ElementAccumulator = int32_t; using ElementCompute = float; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -110,7 +111,7 @@ void run_nonfused_gemm_s8() { B2bInterleavedNonFusedGemmRun nonFusedGemm; std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n"; - bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool pass = nonFusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1); if(pass) std::cout << "Pass\n"; else @@ -123,8 +124,6 @@ void run_fused_gemm_s8() { using ElementAccumulator = int32_t; using ElementCompute = float; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -178,7 +177,7 @@ void run_fused_gemm_s8() { B2bInterleavedFusedGemmRun fusedGemm; std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n"; - bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool passed = fusedGemm.run(gemm_s8_sm75_problem_size_0, gemm_s8_sm75_problem_size_1, alpha0, beta0, alpha1, beta1); if(passed) std::cout << "Pass\n"; else diff --git a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h similarity index 91% rename from examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h rename to examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h index 32b77128e8..8b9eefc604 100644 --- a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h +++ b/examples/13_two_tensor_op_fusion/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -43,14 +43,15 @@ //////////////////////////////////////////////////////////////////////////////// +cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*1600, 64, 576); +cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*1600, 128, 64); + void run_nonfused_gemm_s8_sm80() { using ElementOutput = int8_t; using ElementAccumulator = int32_t; using ElementCompute = float; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -86,8 +87,7 @@ void run_nonfused_gemm_s8_sm80() { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; using Gemm1 = cutlass::gemm::device::Gemm< int8_t, @@ -113,14 +113,13 @@ void run_nonfused_gemm_s8_sm80() { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; B2bInterleavedNonFusedGemmRun nonFusedGemm; std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n"; - bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool pass = nonFusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1); if(pass) std::cout << "Pass\n"; else @@ -133,8 +132,6 @@ void run_fused_gemm_s8_sm80() { using ElementAccumulator = int32_t; using ElementCompute = float; - cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); - cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); ElementCompute alpha0 = ElementCompute(2); ElementCompute beta0 = ElementCompute(0); ElementCompute alpha1 = ElementCompute(2); @@ -193,7 +190,7 @@ void run_fused_gemm_s8_sm80() { B2bInterleavedFusedGemmRun fusedGemm; std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n"; - bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + bool passed = fusedGemm.run(gemm_s8_sm80_problem_size_0, gemm_s8_sm80_problem_size_1, alpha0, beta0, alpha1, beta1); if(passed) std::cout << "Pass\n"; else diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h new file mode 100644 index 0000000000..1b0795fa41 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h @@ -0,0 +1,661 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implicit GEMM testbed +*/ +#pragma once + +#include +#include +#include + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/device/implicit_gemm_convolution.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/host_reorder.h" + +#include "cutlass/util/reference/host/convolution.h" +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/reference/device/tensor_relu.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + + +template +class B2bInterleavedNonFusedConv2dRun { +public: + + using Conv2d0 = Conv2d0_; + using Conv2d1 = Conv2d1_; + using ElementAccumulator = typename Conv2d0::ElementAccumulator; + using ElementCompute = typename Conv2d0::ElementCompute; + + static cutlass::conv::Operator const kConvolutionalOperator = Conv2d0::kConvolutionalOperator; + static_assert(kConvolutionalOperator == Conv2d1::kConvolutionalOperator, + "Fused convolution operators must be the same"); + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A0; + cutlass::HostTensor tensor_B0; + cutlass::HostTensor tensor_B0_reordered; + cutlass::HostTensor tensor_C0; + cutlass::HostTensor tensor_D0_computed; + cutlass::HostTensor tensor_D0_reference; + + cutlass::HostTensor tensor_B1; + cutlass::HostTensor tensor_B1_reordered; + cutlass::HostTensor tensor_C1; + cutlass::HostTensor tensor_D1_computed; + cutlass::HostTensor tensor_D1_reference; + + +public: + + B2bInterleavedNonFusedConv2dRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + 
cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 16) { + scope = 2; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) { + + tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0)); + tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + + initialize_tensor(tensor_A0.host_view(), init_A, seed); + initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C0.host_view(), init_C, seed * 39); + initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); + initialize_tensor(tensor_C1.host_view(), init_C, seed * 40); + + //Reorder B0 and B1 + cutlass::reorder_convK( + tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0)); + cutlass::reorder_convK( + tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1)); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_B0_reordered.sync_device(); + tensor_C0.sync_device(); + tensor_D0_computed.sync_device(); + tensor_D0_reference.sync_device(); + tensor_B1.sync_device(); + tensor_B1_reordered.sync_device(); + tensor_C1.sync_device(); + tensor_D1_computed.sync_device(); + tensor_D1_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true, + int warm_ups = 1, + int runs = 100) { + + initialize(problem_size_0, problem_size_1); + + // configure the operator + Conv2d0 conv2d_op_0; + 
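    // Note: in this non-fused baseline the two convolutions are launched as separate
    // kernels; conv2d_op_1 (declared next) reads tensor_D0_computed -- the first
    // convolution's output written back to global memory -- as its activation input.
    // The fused B2bConv2d runner later in this file passes both problem sizes to a
    // single operator so that this intermediate stays threadblock-resident instead.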
Conv2d1 conv2d_op_1; + + typename Conv2d0::Arguments conv2d_args_0( + problem_size_0, + tensor_A0.device_ref(), + tensor_B0_reordered.device_ref(), + tensor_C0.device_ref(), + tensor_D0_computed.device_ref(), + {alpha0, beta0}, + split_k_mode + ); + typename Conv2d1::Arguments conv2d_args_1( + problem_size_1, + tensor_D0_computed.device_ref(), + tensor_B1_reordered.device_ref(), + tensor_C1.device_ref(), + tensor_D1_computed.device_ref(), + {alpha1, beta1}, + split_k_mode + ); + + + cutlass::Status status = conv2d_op_0.initialize(conv2d_args_0); + + CUTLASS_CHECK(status); + + status = conv2d_op_1.initialize(conv2d_args_1); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = conv2d_op_0(); + CUTLASS_CHECK(status); + status = conv2d_op_1(); + CUTLASS_CHECK(status); + } + + // + // Run Conv2d + // + cudaEvent_t start, stop1, stop2; + cudaEventCreate(&start); + cudaEventCreate(&stop1); + cudaEventCreate(&stop2); + + cudaEventRecord(start); + + + for(int i = 0; i < runs; i++) { + // run conv2d operator + status = conv2d_op_0(); + CUTLASS_CHECK(status); + } + cudaEventRecord(stop1); + + for(int i = 0; i < runs; i++) { + // run conv2d operator + status = conv2d_op_1(); + CUTLASS_CHECK(status); + } + cudaEventRecord(stop2); + cudaDeviceSynchronize(); + float conv2d0Time, conv2d1Time, totalTime; + cudaEventElapsedTime(&conv2d0Time, start, stop1); + cudaEventElapsedTime(&conv2d1Time, stop1, stop2); + cudaEventElapsedTime(&totalTime, start, stop2); + std::cout << "conv2d 0 time " << conv2d0Time / (float)runs << " ms\n"; + std::cout << "conv2d 1 time " << conv2d1Time / (float)runs << " ms\n"; + std::cout << "total time " << totalTime / (float)runs << " ms\n"; + + tensor_D0_computed.sync_host(); + tensor_D1_computed.sync_host(); + + bool passed = false; + + cutlass::reference::device::Conv2d< + typename Conv2d0::ElementA, + typename Conv2d0::LayoutA, + typename Conv2d0::ElementB, + typename Conv2d0::LayoutB, + typename Conv2d0::ElementC, + typename Conv2d0::LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_reference.device_ref(), + alpha0, + beta0); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); + } + + cutlass::reference::device::Conv2d< + typename Conv2d1::ElementA, + typename Conv2d1::LayoutA, + typename Conv2d1::ElementB, + typename Conv2d1::LayoutB, + typename Conv2d1::ElementC, + typename Conv2d1::LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size_1, + tensor_D0_reference.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_reference.device_ref(), + alpha1, + beta1); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); + } + + cudaError_t result = cudaDeviceSynchronize(); + CHECK_TRUE(result == cudaSuccess); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D0_reference.sync_host(); + tensor_D1_reference.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0); + + passed = 
cutlass::reference::host::TensorEquals( + tensor_D1_computed.host_view(), + tensor_D1_reference.host_view()); + + CHECK_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_B2bImplicitGemm_device_interleaved_nonfused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream results(fname.str()); + + results << problem_size_0 << std::endl; + results << problem_size_1 << std::endl; + + results + << "\nA0:\n" << tensor_A0.host_view() << "\n" + << "\nB0:\n" << tensor_B0.host_view() << "\n" + << "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n" + << "\nC0:\n" << tensor_C0.host_view() << "\n" + << "\nD0 reference:\n" << tensor_D0_reference.host_view() << "\n" + << "\nD0 computed:\n" << tensor_D0_computed.host_view() << "\n" + << "\nB1:\n" << tensor_B1.host_view() << "\n" + << "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n" + << "\nC1:\n" << tensor_C1.host_view() << "\n" + << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n" + << "\nD1 computed:\n" << tensor_D1_computed.host_view(); + + + } + + return passed; + } + +}; + +template +class B2bInterleavedFusedConv2dRun { +public: + + using B2bConv2d = B2bConv2d_; + using ElementAccumulator = typename B2bConv2d::ElementAccumulator; + using ElementCompute = typename B2bConv2d::ElementCompute; + + static cutlass::conv::Operator const kConvolutionalOperator = B2bConv2d::kConvolutionalOperator; + +public: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A0; + cutlass::HostTensor tensor_B0; + cutlass::HostTensor tensor_B0_reordered; + cutlass::HostTensor tensor_C0; + cutlass::HostTensor tensor_D0_reference; + + cutlass::HostTensor tensor_B1; + cutlass::HostTensor tensor_B1_reordered; + cutlass::HostTensor tensor_C1; + cutlass::HostTensor tensor_D1_computed; + cutlass::HostTensor tensor_D1_reference; + + +public: + + B2bInterleavedFusedConv2dRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { + + } + + /// Helper to initialize a tensor view + template + void initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + int scope; + int bits = cutlass::sizeof_bits::value; + + if (bits <= 16) { + scope = 2; + } + else { + scope = 8; + } + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope, -scope, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential(view.data(), view.capacity()); + } + else { + } + } + + void initialize( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, uint64_t seed = 2019) { + + tensor_A0.resize(implicit_gemm_tensor_a_extent(kConvolutionalOperator, problem_size_0)); + tensor_B0.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, 
problem_size_0)); + tensor_B0_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_0)); + tensor_C0.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_D0_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_0)); + tensor_B1.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_B1_reordered.resize(implicit_gemm_tensor_b_extent(kConvolutionalOperator, problem_size_1)); + tensor_C1.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_computed.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + tensor_D1_reference.resize(implicit_gemm_tensor_c_extent(kConvolutionalOperator, problem_size_1)); + + initialize_tensor(tensor_A0.host_view(), init_A, seed); + initialize_tensor(tensor_B0.host_view(), init_B, seed * 17); + initialize_tensor(tensor_C0.host_view(), init_C, seed * 39); + initialize_tensor(tensor_B1.host_view(), init_B, seed * 18); + initialize_tensor(tensor_C1.host_view(), init_C, seed * 40); + + //Reorder B0 and B1 + cutlass::reorder_convK<16, InterleavedK>( + tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_0)); + cutlass::reorder_convK( + tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), implicit_gemm_problem_size(kConvolutionalOperator, problem_size_1)); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_B0_reordered.sync_device(); + tensor_C0.sync_device(); + tensor_D0_reference.sync_device(); + tensor_B1.sync_device(); + tensor_B1_reordered.sync_device(); + tensor_C1.sync_device(); + tensor_D1_computed.sync_device(); + tensor_D1_reference.sync_device(); + } + + /// Executes one test + bool run( + cutlass::conv::Conv2dProblemSize const &problem_size_0, + cutlass::conv::Conv2dProblemSize const &problem_size_1, + cutlass::conv::SplitKMode const &split_k_mode = cutlass::conv::SplitKMode::kSerial, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true, + int warm_ups = 1, + int runs = 100) { + + initialize(problem_size_0, problem_size_1); + + // configure the operator + B2bConv2d b2b_conv2d_op; + + typename B2bConv2d::Arguments b2b_conv2d_args( + problem_size_0, + problem_size_1, + tensor_A0.device_ref(), + tensor_B0_reordered.device_ref(), + tensor_C0.device_ref(), + tensor_B1_reordered.device_ref(), + tensor_C1.device_ref(), + tensor_D1_computed.device_ref(), + {alpha0, beta0}, + {alpha1, beta1}, + split_k_mode + ); + + cutlass::Status status = b2b_conv2d_op.initialize(b2b_conv2d_args); + + CUTLASS_CHECK(status); + + for(int i = 0; i < warm_ups; i++) { + status = b2b_conv2d_op(); + CUTLASS_CHECK(status); + } + + // + // Run the Conv2d + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < runs; i++) { + + // run conv2d operator + status = b2b_conv2d_op(); + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float conv2dTime; + cudaEventElapsedTime(&conv2dTime, start, stop); + std::cout << "time " << conv2dTime / (float)runs << " ms\n"; + + tensor_D1_computed.sync_host(); + + bool passed = false; + + cutlass::reference::device::Conv2d< + typename B2bConv2d::ElementA, + typename B2bConv2d::LayoutA, + typename B2bConv2d::ElementB, + typename 
B2bConv2d::LayoutB, + typename B2bConv2d::ElementC, + typename B2bConv2d::LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0_reference.device_ref(), + alpha0, + beta0); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D0_reference.device_view()); + } + + cutlass::reference::device::Conv2d< + typename B2bConv2d::ElementA, + typename B2bConv2d::LayoutA, + typename B2bConv2d::ElementB, + typename B2bConv2d::LayoutB, + typename B2bConv2d::ElementC, + typename B2bConv2d::LayoutC, + ElementCompute, + ElementAccumulator, + cutlass::NumericConverterClamp + >( + kConvolutionalOperator, + problem_size_1, + tensor_D0_reference.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1_reference.device_ref(), + alpha1, + beta1); + + if(relu) { + cutlass::reference::device::TensorReLu(tensor_D1_reference.device_view()); + } + + cudaError_t result = cudaDeviceSynchronize(); + CHECK_TRUE(result == cudaSuccess); + + // sync host (copy device data to host) for dumping error output in case of mismatches + tensor_D0_reference.sync_host(); + tensor_D1_reference.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0_reference.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_computed.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1_reference.host_view()), 0); + + passed = cutlass::reference::host::TensorEquals( + tensor_D1_computed.host_view(), + tensor_D1_reference.host_view()); + + CHECK_TRUE(passed); + + if (!passed) { + std::stringstream fname; + + fname << "error_B2bImplicitGemm_device_interleaved_fused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream results(fname.str()); + + results << problem_size_0 << std::endl; + results << problem_size_1 << std::endl; + + results + << "\nA0:\n" << tensor_A0.host_view() << "\n" + << "\nB0:\n" << tensor_B0.host_view() << "\n" + << "\nB0_reordered:\n" << tensor_B0_reordered.host_view() << "\n" + << "\nC0:\n" << tensor_C0.host_view() << "\n" + << "\nB1:\n" << tensor_B1.host_view() << "\n" + << "\nB1_reordered:\n" << tensor_B1_reordered.host_view() << "\n" + << "\nC1:\n" << tensor_C1.host_view() << "\n" + << "\nD1 reference:\n" << tensor_D1_reference.host_view() << "\n" + << "\nD1 computed:\n" << tensor_D1_computed.host_view(); + + + } + + return passed; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h similarity index 98% rename from examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h rename to examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h index e98be9e511..c33494095d 100644 --- a/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h +++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -243,6 +243,7 @@ struct B2bInterleavedNonFusedGemmRun status = gemm_op_1(); CUTLASS_CHECK(status); } + // // Run the GEMM // @@ -455,10 +456,6 @@ struct B2bInterleavedFusedGemmRun typename B2bGemm::ElementC, typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); -// cutlass::HostTensor< -// typename B2bGemm::ElementC, -// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn()); - cutlass::HostTensor< typename B2bGemm::ElementC, typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); @@ -507,7 +504,6 @@ struct B2bInterleavedFusedGemmRun tensor_B0.sync_device(); tensor_B0_reordered.sync_device(); tensor_C0.sync_device(); - //tensor_D0.sync_device(); tensor_B1.sync_device(); tensor_B1_reordered.sync_device(); tensor_C1.sync_device(); @@ -566,7 +562,6 @@ struct B2bInterleavedFusedGemmRun cudaEventElapsedTime(&gemmTime, start, stop); std::cout << "time " << gemmTime / (float)runs << " ms\n"; - //tensor_D0.sync_host(); tensor_D1.sync_host(); // @@ -635,7 +630,6 @@ struct B2bInterleavedFusedGemmRun << "\nB0 =\n" << tensor_B0.host_view() << "\nB0_reordered =\n" << tensor_B0_reordered.host_view() << "\nC0 =\n" << tensor_C0.host_view() -// << "\nD0 =\n" << tensor_D0.host_view() << "\nB1 =\n" << tensor_B1.host_view() << "\nB1_reordered =\n" << tensor_B1_reordered.host_view() << "\nC1 =\n" << tensor_C1.host_view() diff --git a/examples/13_fused_two_gemms/device/b2b_gemm.h b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h similarity index 99% rename from examples/13_fused_two_gemms/device/b2b_gemm.h rename to examples/13_two_tensor_op_fusion/device/b2b_gemm.h index 3f161435dd..b72ac2918f 100644 --- a/examples/13_fused_two_gemms/device/b2b_gemm.h +++ b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h new file mode 100644 index 0000000000..64f97b7b39 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h @@ -0,0 +1,274 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Template for device-level Implicit GEMM +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/conv/convolution.h" + +#include "kernel/b2b_implicit_gemm_convolution.h" +#include "kernel/default_b2b_conv2d_fprop.h" + +namespace cutlass { +namespace conv { +namespace device { + +template +class B2bImplicitGemmConvolution { +public: + + using B2bImplicitGemmKernel = B2bImplicitGemmKernel_; + + using ElementA = typename B2bImplicitGemmKernel::ElementA; + using LayoutA = typename B2bImplicitGemmKernel::LayoutA; + using ElementB = typename B2bImplicitGemmKernel::ElementB; + using LayoutB = typename B2bImplicitGemmKernel::LayoutB; + using ElementC = typename B2bImplicitGemmKernel::ElementC; + using LayoutC = typename B2bImplicitGemmKernel::LayoutC; + using ElementAccumulator = typename B2bImplicitGemmKernel::ElementAccumulator; + using ElementCompute = typename B2bImplicitGemmKernel::ElementCompute; + using OperatorClass = typename B2bImplicitGemmKernel::OperatorClass; + using ArchTag = typename B2bImplicitGemmKernel::ArchTag; + using ThreadblockShape0 = typename B2bImplicitGemmKernel::ThreadblockShape0; + using ThreadblockShape1 = typename B2bImplicitGemmKernel::ThreadblockShape1; + using WarpShape0 = typename B2bImplicitGemmKernel::WarpShape0; + using WarpShape1 = typename B2bImplicitGemmKernel::WarpShape1; + using InstructionShape = typename B2bImplicitGemmKernel::InstructionShape; + using ThreadblockSwizzle = typename B2bImplicitGemmKernel::ThreadblockSwizzle; + using EpilogueOutputOp0 = typename B2bImplicitGemmKernel::EpilogueOutputOp0; + using EpilogueOutputOp1 = typename B2bImplicitGemmKernel::EpilogueOutputOp1; + static int const kStages = B2bImplicitGemmKernel::kStages; + static int const kConvDim = B2bImplicitGemmKernel::kConvDim; + using WarpMmaOperator0 = typename B2bImplicitGemmKernel::WarpMmaOperator0; + using WarpMmaOperator1 = typename B2bImplicitGemmKernel::WarpMmaOperator1; + using ArchMmaOperator = typename B2bImplicitGemmKernel::ArchMmaOperator; + using MathOperator = typename B2bImplicitGemmKernel::MathOperator; + + static cutlass::conv::Operator const kConvolutionalOperator = B2bImplicitGemmKernel::kConvolutionalOperator; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = B2bImplicitGemmKernel::kIteratorAlgorithm; + + static int const kWarpCount = + (ThreadblockShape0::kM / WarpShape0::kM) * + (ThreadblockShape0::kN / WarpShape0::kN); + + /// Argument structure + using Arguments = typename B2bImplicitGemmKernel::Arguments; + +private: + + /// Kernel parameters object + typename B2bImplicitGemmKernel::Params 
params_; + +public: + + /// Constructs Implicit GEMM + B2bImplicitGemmConvolution() { } + + /// Determines whether the Implicit GEMM can execute the given problem. + static Status can_implement(Arguments const &args) { + + // dispatch to iterators + Status status = B2bImplicitGemmKernel::B2bMma::IteratorA0::can_implement(args.problem_size_0); + if (Status::kSuccess != status) { + return status; + } + + status = B2bImplicitGemmKernel::B2bMma::IteratorB0::can_implement(args.problem_size_0); + if (Status::kSuccess != status) { + return status; + } + + status = B2bImplicitGemmKernel::B2bMma::IteratorB1::can_implement(args.problem_size_1); + if (Status::kSuccess != status) { + return status; + } + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape( + threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0), + {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, + args.problem_size_0.split_k_slices)); + + if (!(grid.y <= std::numeric_limits::max() && + grid.z <= std::numeric_limits::max())) { + + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t workspace_bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0), + {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, + args.problem_size_0.split_k_slices); + + if(args.split_k_mode == SplitKMode::kParallel) { + + // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace. + // The user needs to call a reduction operator to optain the final output tensor + workspace_bytes = + sizeof(ElementAccumulator) * + size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size_0)) * + size_t(grid_tiled_shape.k()); + } + + else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size_0.split_k_slices > 1) { + + // Split-K serial: The user workspace is used to store semaphore and serialize writing the + // final reduced output to user's output tensor + workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n()); + } + + return workspace_bytes; + } + + /// Initializes GEMM state from arguments. 
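  // For split-K runs (problem_size_0.split_k_slices > 1) a non-null workspace sized by
  // get_workspace_size() is required and is zero-filled before the Params structure is
  // built. As a concrete illustration (hypothetical sizes): with kSerial split-K and a
  // tiled shape of 8 x 4 threadblocks the workspace holds 8 * 4 semaphores, i.e.
  // 32 * sizeof(int) bytes; with kParallel split-K over 4 slices it instead holds 4
  // accumulator-typed copies of the first convolution's output tensor. When the kernel's
  // SharedStorage exceeds the 48 KB static limit, the dynamic shared-memory limit is
  // raised via cudaFuncSetAttribute before launch.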
+ Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + if (args.problem_size_0.split_k_slices > 1) { + + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream); + + if (status != cudaSuccess) { + return Status::kErrorInternal; + } + } + + // initialize the params structure from the arguments + params_ = typename B2bImplicitGemmKernel::Params( + args, + static_cast(workspace) + ); + + int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + + result = cudaFuncSetAttribute( + cutlass::Kernel, + cudaFuncAttributePreferredSharedMemoryCarveout, 100); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Initializes GEMM state from arguments. + Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.ptr_A0 = args.ref_A0.data(); + params_.ptr_B0 = args.ref_B0.data(); + params_.ptr_C0 = args.ref_C0.data(); + params_.ptr_B1 = args.ref_B1.data(); + params_.ptr_C1 = args.ref_C1.data(); + params_.ptr_D1 = args.ref_D1.data(); + params_.output_op_0 = args.output_op_0; + params_.output_op_1 = args.output_op_1; + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(32 * kWarpCount, 1, 1); + + int smem_size = int(sizeof(typename B2bImplicitGemmKernel::SharedStorage)); + + cutlass::Kernel<<>>(params_); + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace device +} // namespace conv +} // namespace cutlass +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/fused_conv2d.cu b/examples/13_two_tensor_op_fusion/fused_conv2d.cu new file mode 100644 index 0000000000..f6bb3d7259 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/fused_conv2d.cu @@ -0,0 +1,102 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.h" +#include "b2b_conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.h" +#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h" +#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h" + +int run() { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + std::cout << "Running on SM80" << std::endl; + run_nonfused_conv2d_fprop_optimized_f16_sm80(); + run_fused_conv2d_fprop_optimized_f16_sm80(); + run_nonfused_conv2d_fprop_optimized_s8_sm80(); + run_fused_conv2d_fprop_optimized_s8_sm80(); +#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + std::cout << "Running on SM75" << std::endl; + run_nonfused_conv2d_fprop_optimized_f16_sm75(); + run_fused_conv2d_fprop_optimized_f16_sm75(); + run_nonfused_conv2d_fprop_optimized_s8_sm75(); + run_fused_conv2d_fprop_optimized_s8_sm75(); +#endif + + return 0; +} + +int main() { + + bool notSupported = false; + + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." 
<< std::endl; + + notSupported = true; + + } + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75." + << std::endl; + + notSupported = true; + } + + if (notSupported) { + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + + return run(); +} + diff --git a/examples/13_fused_two_gemms/fused_gemm.cu b/examples/13_two_tensor_op_fusion/fused_gemm.cu similarity index 66% rename from examples/13_fused_two_gemms/fused_gemm.cu rename to examples/13_two_tensor_op_fusion/fused_gemm.cu index b96a0ef090..65bad94338 100644 --- a/examples/13_fused_two_gemms/fused_gemm.cu +++ b/examples/13_two_tensor_op_fusion/fused_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -22,43 +22,22 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ -/* - -This example shows fusing two GEMM mainloops into one kernel. The first GEMM computes relu(alpha*A*B) and -the second GEMM computes relu(alpha*A*B+beta*C). The performance measuring environment compares against -two unfused GEMM operations, demonstrating a speedup of the fused kernel on the -NVIDIA Turing GPU architecture. - -Problem size: - GEMM1 (M,N,K): 128*1600, 64, 576 - GEMM2 (M,N,K): 128*1600, 128, 64 - -Note that GEMM1_N = GEMM2_K - -The example requires the number of threadblocks be the same across 2 GEMMs and -thread_block_tile_N = problem_N so the data required by each layer is threadblock-resident. It -also requires warp_tile_N = thread_block_tile_N so the data required by each warp is -register-file-resident. - -Performance: - - fp16 on Tesla T4 @ 1590MHz (non-fused vs. fused): 1.39011 ms vs. 1.26035 ms - - int8 on Tesla T4 @ 1590MHz (non-fused vs. fused): 0.751759 ms vs. 0.62971 ms - - fp16 on Quadro RTX 8000 @ 1890MHz (non-fused vs. fused): 0.721144 ms vs. 0.629864 ms - - int8 on Quadro RTX 8000 @ 1890MHz (non-fused vs. fused): 0.379049 ms vs. 0.324764 ms - - int8 on GA100 @ 1200MHz (non-fused vs. fused): 0.153795 ms vs. 0.129874 ms - -*/ #include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h" +#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm80.h" #include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h" #include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h" int run() { #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + std::cout << "Running on SM80" << std::endl; + run_nonfused_gemm_f16_sm80(); + run_fused_gemm_f16_sm80(); run_nonfused_gemm_s8_sm80(); run_fused_gemm_s8_sm80(); #elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + std::cout << "Running on SM75" << std::endl; run_nonfused_gemm_f16(); run_fused_gemm_f16(); run_nonfused_gemm_s8(); @@ -74,9 +53,9 @@ int main() { // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. 
// - // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. + // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { - std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl; notSupported = true; } @@ -90,7 +69,7 @@ int main() { } if (!(props.major * 10 + props.minor >= 75)) { - std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75." << std::endl; notSupported = true; diff --git a/examples/13_fused_two_gemms/kernel/b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h similarity index 99% rename from examples/13_fused_two_gemms/kernel/b2b_gemm.h rename to examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h index a67b1e877c..5627fc319b 100644 --- a/examples/13_fused_two_gemms/kernel/b2b_gemm.h +++ b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h new file mode 100644 index 0000000000..9a7b462a38 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h @@ -0,0 +1,475 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined Implicit GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename B2bMma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ = Conv2dProblemSize ///! 
Convolutional operator on 2D or 3D problem +> +struct B2bImplicitGemmConvolution { + + using B2bMma = B2bMma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp0 = typename B2bMma::OutputOp; + using EpilogueOutputOp1 = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename B2bMma::IteratorA0::Element; + using LayoutA = typename B2bMma::IteratorA0::Layout; + using ElementB = typename B2bMma::IteratorB0::Element; + using LayoutB = typename B2bMma::IteratorB0::Layout; + using ElementC = typename EpilogueOutputOp1::ElementOutput; + + /// Set output tensor C layout + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp0::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp0::ElementCompute; + + using WarpMmaOperator0 = typename B2bMma::Policy0::Operator; + using WarpMmaOperator1 = typename B2bMma::Policy1::Operator; + + using ArchMmaOperator = typename WarpMmaOperator0::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator0::OperatorClass; + using ArchTag = typename WarpMmaOperator0::ArchTag; + + using ThreadblockShape0 = typename B2bMma::Shape0; + using ThreadblockShape1 = typename B2bMma::Shape1; + using WarpShape0 = typename WarpMmaOperator0::Shape; + using WarpShape1 = typename WarpMmaOperator1::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + static int const kStages = B2bMma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = B2bMma::IteratorA0::kIteratorAlgorithm; + + /// Warp count (concept: GemmShape) + using WarpCount0 = typename B2bMma::WarpCount0; + static int const kThreadCount = 32 * WarpCount0::kCount; + + using TensorRefA0 = typename B2bMma::IteratorA0::TensorRef; + using TensorRefB0 = typename B2bMma::IteratorB0::TensorRef; + using TensorRefB1 = typename B2bMma::IteratorB1::TensorRef; + using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are the same and + // set device::B2bImplicitGemmConvolution::kConvDim + static_assert(B2bMma::IteratorA0::kConvDim == B2bMma::IteratorB0::kConvDim, + "Convolution on different different dimensions is not supported"); + static int const kConvDim = B2bMma::IteratorA0::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + /// Wgrad C stride idx for implicit gemm algorithm + // Conv2d row-major matrix C (KxRSC) + // Conv3d row-major matrix C (KxTRSC) + static int const kWgradCStrideIdx = + cutlass::platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorCStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? 
kWgradCStrideIdx : 0); + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size_0; + ConvProblemSize problem_size_1; + TensorRefA0 ref_A0; + TensorRefB0 ref_B0; + TensorRefC ref_C0; + TensorRefB1 ref_B1; + TensorRefC ref_C1; + TensorRefC ref_D1; + typename EpilogueOutputOp0::Params output_op_0; + typename EpilogueOutputOp1::Params output_op_1; + SplitKMode split_k_mode; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size_0, + ConvProblemSize const & problem_size_1 + ): + problem_size_0(problem_size_0), + problem_size_1(problem_size_1) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size_0, + ConvProblemSize const & problem_size_1, + TensorRefA0 const & ref_A0, + TensorRefB0 const & ref_B0, + TensorRefC const & ref_C0, + TensorRefB1 const & ref_B1, + TensorRefC const & ref_C1, + TensorRefC const & ref_D1, + typename EpilogueOutputOp0::Params const & output_op_0, + typename EpilogueOutputOp1::Params const & output_op_1, + SplitKMode const & split_k_mode = SplitKMode::kSerial + ): + problem_size_0(problem_size_0), + problem_size_1(problem_size_1), + ref_A0(ref_A0), + ref_B0(ref_B0), + ref_C0(ref_C0), + ref_B1(ref_B1), + ref_C1(ref_C1), + ref_D1(ref_D1), + output_op_0(output_op_0), + output_op_1(output_op_1), + split_k_mode(split_k_mode) + { + + } + + }; + + /// Parameters structure + struct Params { + ConvProblemSize problem_size_0; + ConvProblemSize problem_size_1; + cutlass::gemm::GemmCoord grid_tiled_shape; + gemm::GemmCoord implicit_gemm_problem_size_0; + gemm::GemmCoord implicit_gemm_problem_size_1; + int gemm_k_iterations_0; + int gemm_k_iterations_1; + typename B2bMma::IteratorA0::Params iterator_A0; + typename B2bMma::IteratorA0::Element const *ptr_A0; + typename B2bMma::IteratorB0::Params iterator_B0; + typename B2bMma::IteratorB0::Element const *ptr_B0; + typename Epilogue::OutputTileIterator::Params iterator_C0; + typename Epilogue::OutputTileIterator::Element *ptr_C0; + typename B2bMma::IteratorB1::Params iterator_B1; + typename B2bMma::IteratorB1::Element const *ptr_B1; + typename Epilogue::OutputTileIterator::Params iterator_C1; + typename Epilogue::OutputTileIterator::Element *ptr_C1; + typename Epilogue::OutputTileIterator::Params iterator_D1; + typename Epilogue::OutputTileIterator::Element *ptr_D1; + typename EpilogueOutputOp0::Params output_op_0; + typename EpilogueOutputOp1::Params output_op_1; + int *semaphore; + SplitKMode split_k_mode; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): gemm_k_iterations_0(0), gemm_k_iterations_1(0) { } + + /// + CUTLASS_HOST_DEVICE + Params( + Arguments const &args, + int *semaphore = nullptr + ): + problem_size_0(args.problem_size_0), + problem_size_1(args.problem_size_1), + implicit_gemm_problem_size_0(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0)), + implicit_gemm_problem_size_1(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_1)), + grid_tiled_shape(grid_tiled_shape), + iterator_A0(B2bMma::IteratorA0::getParams(args.problem_size_0, args.ref_A0.layout())), + ptr_A0(args.ref_A0.data()), + iterator_B0(args.problem_size_0, args.ref_B0.layout()), + 
ptr_B0(args.ref_B0.data()), + iterator_C0(ConvOutputIteratorParameter::layout(args.ref_C0)), + ptr_C0(args.ref_C0.data()), + iterator_B1(args.problem_size_1, args.ref_B1.layout()), + ptr_B1(args.ref_B1.data()), + iterator_C1(ConvOutputIteratorParameter::layout(args.ref_C1)), + ptr_C1(args.ref_C1.data()), + iterator_D1(ConvOutputIteratorParameter::layout(args.ref_D1)), + ptr_D1(args.ref_D1.data()), + output_op_0(args.output_op_0), + output_op_1(args.output_op_1), + semaphore(semaphore), + split_k_mode(args.split_k_mode) + { + gemm_k_iterations_0 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape0::kK, args.problem_size_0); + gemm_k_iterations_1 = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape1::kK, args.problem_size_1); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + implicit_gemm_problem_size_0, + {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, + args.problem_size_0.split_k_slices); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename B2bMma::B2bMmaSharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + B2bImplicitGemmConvolution() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename B2bMma::IteratorA0 iterator_A0( + params.iterator_A0, + params.problem_size_0, + params.ptr_A0, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() * B2bMma::Shape0::kM, + threadblock_tile_idx.k() * B2bMma::Shape0::kK + ) + ); + + typename B2bMma::IteratorB0 iterator_B0( + params.iterator_B0, + params.problem_size_0, + params.ptr_B0, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * B2bMma::Shape0::kK, + threadblock_tile_idx.n() * B2bMma::Shape0::kN + ) + ); + + typename B2bMma::IteratorB1 iterator_B1( + params.iterator_B1, + params.problem_size_1, + params.ptr_B1, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * B2bMma::Shape1::kK, + threadblock_tile_idx.n() * B2bMma::Shape1::kN + ) + ); + + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + EpilogueOutputOp0 output_op_0(params.output_op_0); + + // Construct thread-scoped matrix multiply + B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename B2bMma::FragmentC0 src_accum; + typename B2bMma::FragmentC1 accumulators; + + src_accum.clear(); + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + b2bMma(params.gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_B1, src_accum, output_op_0); + + // + // Epilogue + // + + EpilogueOutputOp1 output_op_1(params.output_op_1); + + // Construct the semaphore. 
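+    // For serial split-K reductions, threadblocks that share an output tile use this
+    // semaphore to serialize their epilogues: the lock is fetched early (so its latency
+    // can overlap iterator construction), each k-partition waits for its own index,
+    // reads the partial results written by the previous partition from the 'D' tensor,
+    // and then releases the lock with the next index (or resets it to zero if it is the
+    // final partition). In parallel split-K mode, each slice instead writes to a distinct
+    // region of the output tensor.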
+ int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); + + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.grid_tiled_shape); + + // If performing a reduction via split-K, fetch the initial synchronization + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op_1.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k()); + } + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() * B2bMma::Shape1::kM, + threadblock_tile_idx.n() * B2bMma::Shape1::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D1( + params.iterator_D1, + params.ptr_D1, + ConvOutputIteratorParameter::extent(params.problem_size_1), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C1( + params.iterator_C1, + params.ptr_C1, + ConvOutputIteratorParameter::extent(params.problem_size_1), + thread_idx, + threadblock_offset + ); + + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_idx.k()) { + iterator_C1 = iterator_D1; + } + + semaphore.wait(threadblock_tile_idx.k()); + + __threadfence(); + } + // Each split-k-slice writes to a unique tensor location + else if (params.split_k_mode == SplitKMode::kParallel) { + iterator_D1.add_pointer_offset(threadblock_tile_idx.k() * + cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size_1)); + } + + // Run efficient epilogue + epilogue(output_op_1, iterator_D1, accumulators, iterator_C1); + + // + // Release the semaphore + // + + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_idx.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h new file mode 100644 index 0000000000..a9813e6d2a --- /dev/null +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h @@ -0,0 +1,1281 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "kernel/b2b_implicit_gemm_convolution.h" +#include "threadblock/b2b_implicit_gemm_pipelined.h" +#include "threadblock/b2b_implicit_gemm_multistage.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultB2bConv2dFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions 
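+//
+// Each partial specialization below composes two DefaultMmaCore definitions (one per
+// fused convolution), the matching activation/filter tile access iterators, a
+// threadblock-scoped B2b MMA (pipelined or multistage), and an epilogue, and exposes
+// the composed result as ::Kernel.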
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage +/// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultB2bConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA0 + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB0 + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB1 + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmMultistage< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + arch::CacheOperation::Always, + IteratorB0, + SmemIteratorB0, + arch::CacheOperation::Global, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + 
arch::CacheOperation::Global, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage +/// pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultB2bConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA0 = typename MmaCore0::SmemThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA0 + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
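+  // (The non-interleaved specializations above use MmaCore0::IteratorThreadMapB here;
+  // for the interleaved layouts the shared-memory thread map is reused instead, as
+  // explained in the note above.)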
+ using ThreadMapB0 = typename MmaCore0::SmemThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB0 + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + using ThreadMapB1 = typename MmaCore1::SmemThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB1 + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmMultistage< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + arch::CacheOperation::Always, + IteratorB0, + SmemIteratorB0, + arch::CacheOperation::Global, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + arch::CacheOperation::Global, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm +/// and 2 stage pipeline. 
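+///
+/// Unlike the multistage specializations above, which are built on
+/// threadblock::B2bImplicitGemmMultistage and are parameterized by a Stages count and
+/// per-operand cache-operation policies, the two-stage specializations below use
+/// threadblock::B2bImplicitGemmPipelined and wrap each tile access iterator in
+/// conv::threadblock::TileIterator.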
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultB2bConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA0 + > + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB0 + > + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB1 + > + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmPipelined< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + IteratorB0, + SmemIteratorB0, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + ElementC, + LayoutC, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1 + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + 
ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultB2bConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA0 = typename MmaCore0::SmemThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA0 + > + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
+ using ThreadMapB0 = typename MmaCore0::SmemThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB0 + > + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::SmemThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB1 + > + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmPipelined< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + IteratorB0, + SmemIteratorB0, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + ElementC, + LayoutC, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1 + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and +/// multistage pipeline. 
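+///
+/// These specializations mirror the analytic ones above; the difference is that they
+/// instantiate the Conv2dFpropActivationTileAccessIteratorOptimized and
+/// Conv2dFpropFilterTileAccessIteratorOptimized tile access iterators
+/// (IteratorAlgorithm::kOptimized) in place of the analytic iterators.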
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultB2bConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA0 + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB0 + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB1 + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmMultistage< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + arch::CacheOperation::Always, + IteratorB0, + SmemIteratorB0, + arch::CacheOperation::Global, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + arch::CacheOperation::Global, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape1, + 
WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimzed IteratorAlgorithm and +// multistage pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultB2bConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA0 = typename MmaCore0::SmemThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA0 + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
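+  // (Note that the interleaved specializations in this file use layout::RowMajor for the
+  // AccumulatorLayout of FragmentIteratorA1 defined below, whereas the non-interleaved
+  // specializations use layout::ColumnMajor.)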
+ using ThreadMapB0 = typename MmaCore0::SmemThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB0 + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + using ThreadMapB1 = typename MmaCore1::SmemThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB1 + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmMultistage< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + arch::CacheOperation::Always, + IteratorB0, + SmemIteratorB0, + arch::CacheOperation::Global, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + arch::CacheOperation::Global, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm +/// and 2 stage pipeline. 
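+///
+/// Illustrative sketch only (not taken from this example): one way a specialization of
+/// this form might be instantiated. The epilogue operator, tile shapes, and swizzle shown
+/// here are assumptions chosen for illustration, not values used by this file.
+///
+///   using ElementOutput = cutlass::half_t;
+///
+///   // Hypothetical epilogue: linear combination followed by ReLU, 8-element vectors
+///   using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu<
+///       ElementOutput, 8, cutlass::half_t, cutlass::half_t>;
+///
+///   using B2bFpropKernel = typename cutlass::conv::kernel::DefaultB2bConv2dFprop<
+///       cutlass::half_t, cutlass::layout::TensorNHWC,    // A: activations
+///       cutlass::half_t, cutlass::layout::TensorNHWC,    // B: filters
+///       ElementOutput,   cutlass::layout::TensorNHWC,    // C/D: output
+///       cutlass::half_t,                                  // accumulator
+///       cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
+///       cutlass::gemm::GemmShape<64, 64, 32>,             // threadblock tile, conv 0
+///       cutlass::gemm::GemmShape<64, 128, 32>,            // threadblock tile, conv 1
+///       cutlass::gemm::GemmShape<32, 64, 32>,             // warp tile, conv 0
+///       cutlass::gemm::GemmShape<32, 128, 32>,            // warp tile, conv 1
+///       cutlass::gemm::GemmShape<16, 8, 8>,               // Tensor Core instruction
+///       EpilogueOp, EpilogueOp,
+///       cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+///       2,                                                // two-stage pipeline
+///       cutlass::arch::OpMultiplyAdd,
+///       cutlass::conv::IteratorAlgorithm::kOptimized
+///   >::Kernel;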
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultB2bConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA0 + > + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB0 + > + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB1 + > + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmPipelined< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + IteratorB0, + SmemIteratorB0, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + ElementC, + LayoutC, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1 + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + 
ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialzation for Optimized IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape0, + typename ThreadblockShape1, + typename WarpShape0, + typename WarpShape1, + typename InstructionShape, + typename EpilogueOutputOp0, + typename EpilogueOutputOp1, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int InterleavedK +> +struct DefaultB2bConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
+ + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::SmemThreadMapA; + using IteratorA0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA0 + > + >; + + using SmemIteratorA0 = typename MmaCore0::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::SmemThreadMapB; + using IteratorB0 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB0 + > + >; + + using SmemIteratorB0 = typename MmaCore0::SmemIteratorB; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp0>; + + using ThreadMapB1 = typename MmaCore1::SmemThreadMapB; + using IteratorB1 = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB1 + > + >; + + using SmemIteratorB1 = typename MmaCore1::SmemIteratorB; + + + // Warp-level GEMM components + using WarpMmaTensorOp1 = typename MmaCore1::MmaTensorOp; + using MmaPolicy0 = typename MmaCore0::MmaPolicy; + using MmaPolicy1 = typename MmaCore1::MmaPolicy; + + // Define the Mma + using B2bMma = threadblock::B2bImplicitGemmPipelined< + ThreadblockShape0, + IteratorA0, + SmemIteratorA0, + IteratorB0, + SmemIteratorB0, + ThreadblockShape1, + FragmentIteratorA1, + IteratorB1, + SmemIteratorB1, + ElementC, + LayoutC, + EpilogueOutputOp0, + MmaPolicy0, + MmaPolicy1 + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape1, + WarpMmaTensorOp1, + 1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::B2bImplicitGemmConvolution< + B2bMma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h similarity index 76% rename from examples/13_fused_two_gemms/kernel/default_b2b_gemm.h rename to examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h index dab9db904c..cdf537566b 100644 --- a/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h +++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h @@ -1,29 +1,28 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - *modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - *notice, this list of conditions and the following disclaimer in the - *documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its - *contributors may be used to endorse or promote products derived from this - *software without specific prior written permission. + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, - *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING - *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ + /*! 
\file \brief Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with @@ -118,6 +117,75 @@ template < > struct DefaultB2bGemm; +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of A matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp0, + /// Epilogue output operator + typename EpilogueOutputOp1, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator> +struct DefaultB2bGemm { + /// Define the threadblock-scoped matrix multiply-accumulate + using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, + ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, + InstructionShape, Stages, Operator, EpilogueOutputOp0>::ThreadblockB2bMma; + + static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1, + EpilogueOutputOp1::kCount>::Epilogue; + + /// Define the kernel-level GEMM operator. + using B2bGemmKernel = kernel::B2bGemm; +}; + + //////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Turing Architecture diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h new file mode 100644 index 0000000000..8462cfe6f0 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h @@ -0,0 +1,757 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "threadblock/b2b_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
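// Informal overview of the fused structure implemented below: the first
// implicit GEMM accumulates into registers (FragmentC0); FragmentIteratorA1_
// then re-reads those registers as the A operand of the second GEMM, and only
// the B1 operand of the second GEMM is staged through shared memory with
// cp.async. The intermediate activation tile therefore never makes a round
// trip through global memory between the two GEMMs.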
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape0_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA0_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA0_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA0, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB0_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB0_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB0, + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile + // (concept::MmaTensorOpFragmentIterator) + typename FragmentIteratorA1_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB1, + /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) + typename OutputOp_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy0_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy1_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class B2bImplicitGemmMultistage : + public gemm::threadblock::B2bMmaBase { +public: + ///< Base class + using Base = gemm::threadblock::B2bMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape0 = Shape0_; + ///< Iterates over tiles of A operand in global memory + using IteratorA0 = IteratorA0_; + ///< Iterates over tiles of B operand in global memory + using IteratorB0 = IteratorB0_; + ///< Policy describing tuning details + using Policy0 = Policy0_; + + using SmemIteratorA0 = SmemIteratorA0_; + using SmemIteratorB0 = SmemIteratorB0_; + + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape1 = Shape1_; + ///< Iterates over tiles of A operand in global memory + using FragmentIteratorA1 = FragmentIteratorA1_; + ///< Iterates over tiles of B operand in global memory + using IteratorB1 = IteratorB1_; + ///< Policy describing tuning details + using Policy1 = Policy1_; + + using SmemIteratorB1 = SmemIteratorB1_; + + ///< Epilogue after 1st Gemm + using OutputOp = OutputOp_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA0 = CacheOpA0; + static cutlass::arch::CacheOperation::Kind const kCacheOpB0 = CacheOpB0; + static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1; + + // + // Dependent types + // + + using ElementC = typename Policy0::Operator::ElementC; + + /// Fragment of accumulator tile + using FragmentC0 = typename Policy0::Operator::FragmentC; + + /// Warp-level Mma + using Operator0 = typename Policy0::Operator; + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + + /// Warp-level Mma + using Operator1 = typename 
Policy1::Operator; + + /// Internal structure exposed for introspection. + struct Detail { + + static_assert(Base::kWarpGemmIterations0 > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + static_assert(Base::kWarpGemmIterations1 > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA0 = + IteratorA0::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB0 = + IteratorB0::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB1 = + IteratorB1::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA0 = + (AsyncCopyIterationsPerStageA0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB0 = + (AsyncCopyIterationsPerStageB0 + Base::kWarpGemmIterations0 - 1) / Base::kWarpGemmIterations0; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB1 = + (AsyncCopyIterationsPerStageB1 + Base::kWarpGemmIterations1 - 1) / Base::kWarpGemmIterations1; + }; + + private: + + using WarpLoadedFragmentA0 = typename Operator0::FragmentA; + using WarpLoadedFragmentB0 = typename Operator0::FragmentB; + /// Warp Fragment of operand A1 loaded from accmulator tile + using WarpLoadedFragmentA1 = typename FragmentIteratorA1::Fragment; + using WarpLoadedFragmentB1 = typename Operator1::FragmentB; + using WarpTransformedFragmentA0 = typename Operator0::TransformedFragmentA; + using WarpTransformedFragmentB0 = typename Operator0::TransformedFragmentB; + using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA; + using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA0 smem_iterator_A0_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB0 smem_iterator_B0_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + B2bImplicitGemmMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::B2bMmaSharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx), + smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx), + smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the 
threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN); + int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A0_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations0 * warp_idx_k}); + this->warp_tile_iterator_B0_.add_tile_offset( + {Base::kWarpGemmIterations0 * warp_idx_k, warp_idx_n}); + this->warp_tile_iterator_B1_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance_0( + IteratorA0 &iterator_A0, IteratorB0 &iterator_B0, + int group_start_A0 = 0, int group_start_B0 = 0) { + + iterator_A0.set_iteration_index(group_start_A0); + this->smem_iterator_A0_.set_iteration_index(group_start_A0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA0; ++j) { + + if (group_start_A0 + j < Detail::AsyncCopyIterationsPerStageA0) { + typename IteratorA0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A0_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA0::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A0.get(), iterator_A0.valid()); + + ++iterator_A0; + + ++this->smem_iterator_A0_; + } + } + + iterator_B0.set_iteration_index(group_start_B0); + + this->smem_iterator_B0_.set_iteration_index(group_start_B0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB0; ++j) { + if (group_start_B0 + j < Detail::AsyncCopyIterationsPerStageB0) { + typename IteratorB0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B0_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB0::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B0.get(), iterator_B0.valid()); + + ++iterator_B0; + ++this->smem_iterator_B0_; + } + } + } + + CUTLASS_DEVICE + void copy_tiles_and_advance_1( + IteratorB1 &iterator_B1, + int group_start_B1 = 0) { + + iterator_B1.set_iteration_index(group_start_B1); + + this->smem_iterator_B1_.set_iteration_index(group_start_B1); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) { + if (group_start_B1 + j < Detail::AsyncCopyIterationsPerStageB1) { + typename IteratorB1::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B1.get(), iterator_B1.valid()); + + ++iterator_B1; + ++this->smem_iterator_B1_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations_0, + ///< destination accumulator tile + FragmentC1 &accum, + ///< iterator over A operand in global memory + IteratorA0 iterator_A0, + ///< iterator over B operand in global memory + IteratorB0 iterator_B0, + ///< iterator over B operand in global memory + IteratorB1 iterator_B1, + ///< initial value of accumulator + FragmentC0 const &src_accum, + ///< epilogue operation after 1st Gemm + OutputOp output_op_0, + ///< Imaginary strides used for planar-complex only - ignored here + int64_t imag_stride_A = 0, + int64_t 
imag_stride_B = 0) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations_0) { + + iterator_A0.set_iteration_index(0); + this->smem_iterator_A0_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA0; ++j) { + typename IteratorA0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A0_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorA0::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A0.get(), iterator_A0.valid()); + + ++iterator_A0; + ++this->smem_iterator_A0_; + } + + iterator_B0.set_iteration_index(0); + this->smem_iterator_B0_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB0; ++j) { + typename IteratorB0::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B0_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB0::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B0.get(), iterator_B0.valid()); + + ++iterator_B0; + ++this->smem_iterator_B0_; + } + + // Move to the next stage + iterator_A0.advance(); + iterator_B0.advance(); + + this->smem_iterator_A0_.add_tile_offset({0, 1}); + this->smem_iterator_B0_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + FragmentC0 accum0 = src_accum; + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA0 warp_loaded_frag_A0[2]; + WarpLoadedFragmentB0 warp_loaded_frag_B0[2]; + WarpTransformedFragmentA0 warp_transformed_frag_A0[2]; + WarpTransformedFragmentB0 warp_transformed_frag_B0[2]; + + Operator0 warp_mma0; + + this->warp_tile_iterator_A0_.set_kgroup_index(0); + this->warp_tile_iterator_B0_.set_kgroup_index(0); + + this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[0]); + this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[0]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance_0(iterator_A0, iterator_B0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma0.transform(warp_transformed_frag_A0[0], warp_transformed_frag_B0[0], + warp_loaded_frag_A0[0], warp_loaded_frag_B0[0]); + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations_0 > (-Base::kStages + 1);) { + + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
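        // The fragment indexing below implements a two-entry register double
        // buffer: while the MMA consumes fragment [warp_mma_k % 2], the next
        // k-group is prefetched into fragment [(warp_mma_k + 1) % 2].
        // set_kgroup_index() wraps modulo kWarpGemmIterations0, so on the last
        // k-group of a stage the prefetch targets k-group 0 of the following
        // shared-memory stage. For example, with kWarpGemmIterations0 == 4 each
        // stage loads k-groups 1, 2, 3, 0 while the multiply-accumulates
        // consume k-groups 0, 1, 2, 3.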
+ + this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + + this->warp_tile_iterator_A0_.load(warp_loaded_frag_A0[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B0_.load(warp_loaded_frag_B0[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + if (warp_mma_k > 0) + warp_mma0.transform(warp_transformed_frag_A0[warp_mma_k % 2], + warp_transformed_frag_B0[warp_mma_k % 2], + warp_loaded_frag_A0[warp_mma_k % 2], + warp_loaded_frag_B0[warp_mma_k % 2]); + + // Issue global->shared copies for the next stage + int group_start_iteration_A0, group_start_iteration_B0; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations0) { + group_start_iteration_A0 = 0; + group_start_iteration_B0 = 0; + } else { + group_start_iteration_A0 = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA0; + group_start_iteration_B0 = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB0; + } + + copy_tiles_and_advance_0(iterator_A0, iterator_B0, group_start_iteration_A0, + group_start_iteration_B0); + + warp_mma0( + accum0, + warp_transformed_frag_A0[warp_mma_k % 2], + warp_transformed_frag_B0[warp_mma_k % 2], + accum0 + ); + + if (warp_mma_k + 1 == Base::kWarpGemmIterations0) + warp_mma0.transform(warp_transformed_frag_A0[(warp_mma_k + 1) % 2], + warp_transformed_frag_B0[(warp_mma_k + 1) % 2], + warp_loaded_frag_A0[(warp_mma_k + 1) % 2], + warp_loaded_frag_B0[(warp_mma_k + 1) % 2]); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations0) { + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A0.advance(); + iterator_B0.advance(); + + this->smem_iterator_A0_.add_tile_offset({0, 1}); + this->smem_iterator_B0_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A0_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A0_.add_tile_offset( + {0, -Base::kStages * Policy0::kPartitionsK * + Base::kWarpGemmIterations0}); + this->warp_tile_iterator_B0_.add_tile_offset( + {-Base::kStages * Policy0::kPartitionsK * + Base::kWarpGemmIterations0, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations_0; + } + } + + } + + // Insert fence and wait for all outstanding cp.async operations to commit. 
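    // cp_async_fence() commits the cp.async instructions issued so far as one
    // group, and cp_async_wait<0>() blocks until every committed group has
    // completed (the template argument is the number of groups allowed to
    // remain in flight). Draining to zero here, followed by __syncthreads(),
    // makes the hand-off unambiguous: no copy issued for A0/B0 is still
    // outstanding when the B1 prologue of the second GEMM starts committing
    // its own stages.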
+ cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + + // 2nd Implicit Gemm + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile + FragmentIteratorA1 warp_tile_iterator_A1_(accum0); + + // + // Prologue + // + int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations_1) { + + iterator_B1.set_iteration_index(0); + this->smem_iterator_B1_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB1; ++j) { + typename IteratorB1::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B1.get(), iterator_B1.valid()); + + ++iterator_B1; + ++this->smem_iterator_B1_; + } + + // Move to the next stage + iterator_B1.advance(); + + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA1 warp_loaded_frag_A1[2]; + WarpLoadedFragmentB1 warp_loaded_frag_B1[2]; + WarpTransformedFragmentA1 warp_transformed_frag_A1[2]; + WarpTransformedFragmentB1 warp_transformed_frag_B1[2]; + + Operator1 warp_mma1; + + this->warp_tile_iterator_B1_.set_kgroup_index(0); + + warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], output_op_0); + this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[0]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance_1(iterator_B1); + + smem_write_stage_idx = Base::kStages - 1; + smem_read_stage_idx = 0; + + warp_mma1.transform(warp_transformed_frag_A1[0], warp_transformed_frag_B1[0], + warp_loaded_frag_A1[0], warp_loaded_frag_B1[0]); + + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1 - (Base::kStages - 1); + gemm_k_iterations_1 > (-Base::kStages + 1); gemm_k_iterations_1--) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
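        // This loop mirrors the first mainloop, but the A1 fragments are
        // produced by warp_tile_iterator_A1_, which walks the register-held
        // accumulator of the first GEMM; passing output_op_0 to load() applies
        // the first GEMM's scalar epilogue (a LinearCombination-style functor
        // such as the clamped variant named in the template comments) during
        // the load. Only the B1 fragments follow the global-memory ->
        // shared-memory path in this loop.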
+ + this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); + + warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2], output_op_0); + this->warp_tile_iterator_B1_.load(warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + if (warp_mma_k > 0) + warp_mma1.transform(warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + warp_loaded_frag_A1[warp_mma_k % 2], + warp_loaded_frag_B1[warp_mma_k % 2]); + + // Issue global->shared copies for the next stage + int group_start_iteration_B1; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations1) { + group_start_iteration_B1 = 0; + } else { + group_start_iteration_B1 = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB1; + } + + copy_tiles_and_advance_1(iterator_B1, + group_start_iteration_B1); + + warp_mma1( + accum, + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + accum + ); + + if (warp_mma_k + 1 == Base::kWarpGemmIterations1) + warp_mma1.transform(warp_transformed_frag_A1[(warp_mma_k + 1) % 2], + warp_transformed_frag_B1[(warp_mma_k + 1) % 2], + warp_loaded_frag_A1[(warp_mma_k + 1) % 2], + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations1) { + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_B1.advance(); + + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_B1_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * + Base::kWarpGemmIterations1, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + } + } + + } + + // Insert fence and wait for all outstanding cp.async operations to commit. + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h new file mode 100644 index 0000000000..b1e929ed23 --- /dev/null +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h @@ -0,0 +1,483 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "threadblock/b2b_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. 
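// Two-stage (double-buffered) counterpart of B2bImplicitGemmMultistage above:
// tiles are fetched from global memory into register fragments with ordinary
// loads and written to shared memory by the thread block, ping-ponging between
// two shared-memory buffers, instead of being streamed with cp.async. The
// static_assert further down pins Base::kStages to 2 for this variant, which
// is the one typically selected when cp.async is not available (pre-SM80
// tensor-op targets).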
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape0_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA0_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA0_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB0_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB0_, + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile + // (concept::MmaTensorOpFragmentIterator) + typename FragmentIteratorA1_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) + typename OutputOp_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy0_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy1_, + /// Transformation applied to A operand + typename TransformA0_ = NumericArrayConverter< + typename SmemIteratorA0_::Element, + typename IteratorA0_::Element, + IteratorA0_::Fragment::kElements>, + /// + /// Transformation applied to A operand + typename TransformB0_ = NumericArrayConverter< + typename SmemIteratorB0_::Element, + typename IteratorB0_::Element, + IteratorB0_::Fragment::kElements>, + /// + /// Transformation applied to A operand + typename TransformB1_ = NumericArrayConverter< + typename SmemIteratorB1_::Element, + typename IteratorB1_::Element, + IteratorB1_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool +> +class B2bImplicitGemmPipelined : public gemm::threadblock::B2bMmaBase { +public: + + ///< Base class + using Base = gemm::threadblock::B2bMmaBase; + + using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory + using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory + using Policy0 = Policy0_; ///< Policy0 describing tuning details + + using SmemIteratorA0 = SmemIteratorA0_; + using SmemIteratorB0 = SmemIteratorB0_; + + using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over tiles of A operand in global memory + using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory + using Policy1 = Policy1_; ///< Policy1 describing tuning details + + using SmemIteratorB1 = SmemIteratorB1_; + + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + + using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm + + using TransformA0 = TransformA0_; + using TransformB0 = TransformB0_; + using TransformB1 = 
TransformB1_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA0 = typename IteratorA0::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB0 = typename IteratorB0::Fragment; + + /// Fragment of accumulator tile + using FragmentC0 = typename Policy0::Operator::FragmentC; + + /// Warp-level Mma + using Operator0 = typename Policy0::Operator; + + /// Fragment of operand B loaded from global memory + using FragmentB1 = typename IteratorB1::Fragment; + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + + /// Warp-level Mma + using Operator1 = typename Policy1::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy0::Operator::ArchTag; + + /// Complex transform on A0 operand + static ComplexTransform const kTransformA0 = Operator0::kTransformA; + + /// Complex transform on B0 operand + static ComplexTransform const kTransformB0 = Operator0::kTransformB; + + /// Complex transform on B1 operand + static ComplexTransform const kTransformB1 = Operator1::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); + +private: + + using WarpFragmentA0 = typename Operator0::FragmentA; + using WarpFragmentB0 = typename Operator0::FragmentB; + /// Warp Fragment of operand A1 loaded from accmulator tile + using WarpFragmentA1 = typename FragmentIteratorA1::Fragment; + using WarpFragmentB1 = typename Operator1::FragmentB; + +protected: + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA0 smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B0 operand to shared memory + SmemIteratorB0 smem_iterator_B0_; + + /// Iterator to write threadblock-scoped tile of B1 operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + B2bImplicitGemmPipelined( + typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx), + smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx), + smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) { + + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN); + int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM; + + //These may change across different GEMM layers + int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k; + int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, 
tile_offset_k_0}); + this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n}); + this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n}); + + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations_0, ///< number of iterations of the mainloop + FragmentC1 &accum, ///< destination accumulator tile + IteratorA0 iterator_A, ///< iterator over A operand in global memory + IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory + IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory + FragmentC0 const &src_accum, ///< source accumulator tile + OutputOp output_op_0, ///< epilogue operation after 1st Gemm + TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment + TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment + TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + FragmentC0 accum0 = src_accum; + + FragmentA0 tb_frag_A; + FragmentB0 tb_frag_B0; + + tb_frag_A.clear(); + tb_frag_B0.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B0.load(tb_frag_B0); + + ++iterator_A; + ++iterator_B0; + + this->smem_iterator_A_.store(transform_A0(tb_frag_A)); + this->smem_iterator_B0_.store(transform_B0(tb_frag_B0)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B0_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA0 warp_frag_A0[2]; + WarpFragmentB0 warp_frag_B0[2]; + + this->warp_tile_iterator_A0_.set_kgroup_index(0); + this->warp_tile_iterator_B0_.set_kgroup_index(0); + + this->warp_tile_iterator_A0_.load(warp_frag_A0[0]); + this->warp_tile_iterator_B0_.load(warp_frag_B0[0]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + Operator0 warp_mma0; + + int smem_write_stage_idx = 1; + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. 
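        // Double-buffering for the 2-stage pipeline: the threadblock fragments
        // prefetched from global memory at warp_mma_k == 0 are written to the
        // other shared-memory buffer on the final warp-level k iteration, and
        // smem_write_stage_idx is toggled with XOR. The negative
        // add_tile_offset() calls rewind whichever iterators have just walked
        // past the end of the two-stage circular buffer (the shared-memory
        // write iterators on one toggle, the warp-level read iterators on the
        // other).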
+ + if (warp_mma_k == Base::kWarpGemmIterations0 - 1) { + + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A0(tb_frag_A)); + + this->smem_iterator_B0_.store(transform_B0(tb_frag_B0)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B0_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_A0_.add_tile_offset( + {0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0}); + this->warp_tile_iterator_B0_.add_tile_offset( + {-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + + this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + if (warp_mma_k == 0) { + + iterator_A.load(tb_frag_A); + iterator_B0.load(tb_frag_B0); + + ++iterator_A; + ++iterator_B0; + } + + warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2], + warp_frag_B0[warp_mma_k % 2], accum0); + + } + } + + + //2nd Implicit Gemm + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile + FragmentIteratorA1 warp_tile_iterator_A1_(accum0); + + // + // Prologue + // + + FragmentB1 tb_frag_B1; + + tb_frag_B1.clear(); + + // The last kblock is loaded in the prolog + iterator_B1.load(tb_frag_B1); + + + ++iterator_B1; + + this->smem_iterator_B1_.store(transform_B1(tb_frag_B1)); + + ++this->smem_iterator_B1_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA1 warp_frag_A1[2]; + WarpFragmentB1 warp_frag_B1[2]; + + this->warp_tile_iterator_B1_.set_kgroup_index(0); + + warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0); + this->warp_tile_iterator_B1_.load(warp_frag_B1[0]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + Operator1 warp_mma1; + + smem_write_stage_idx = 1; + + int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_PRAGMA_UNROLL + for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. 
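        // Second-GEMM loop of the pipelined variant. Its trip count,
        // gemm_k_iterations_1, is a compile-time constant derived from
        // FragmentIteratorA1::Policy::kIterations, because the K extent of the
        // second GEMM is the N extent of the first GEMM's threadblock tile
        // held in registers; it does not depend on the convolution problem
        // size. Only the B1 operand cycles through shared memory here, so the
        // store/rewind bookkeeping below applies to smem_iterator_B1_ and
        // warp_tile_iterator_B1_ alone.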
+ + if (warp_mma_k == Base::kWarpGemmIterations1 - 1) { + + this->smem_iterator_B1_.store(transform_B1(tb_frag_B1)); + + __syncthreads(); + + ++this->smem_iterator_B1_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_B1_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * Base::kWarpGemmIterations1, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); + + warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0); + this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + if (warp_mma_k == 0) { + + iterator_B1.load(tb_frag_B1); + + ++iterator_B1; + } + + warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], + warp_frag_B1[warp_mma_k % 2], accum); + + } + } + + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h similarity index 99% rename from examples/13_fused_two_gemms/threadblock/b2b_mma_base.h rename to examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h index 01cca8b7a2..4293ec3dc9 100644 --- a/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/13_fused_two_gemms/threadblock/b2b_mma_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h similarity index 95% rename from examples/13_fused_two_gemms/threadblock/b2b_mma_multistage.h rename to examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h index 8782b7af55..f09045a8b8 100644 --- a/examples/13_fused_two_gemms/threadblock/b2b_mma_multistage.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -635,40 +635,9 @@ class B2bMmaMultistage : ++stage, --gemm_k_iterations_1) { if (gemm_k_iterations_1 == 0) { -// iterator_A1.clear_mask(); iterator_B1.clear_mask(); } -#if 0 - iterator_A1.set_iteration_index(0); - this->smem_iterator_A1_.set_iteration_index(0); - - // LDGSTS for operand A - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < Detail::TBLDGSTSIterationsA1; ++j) { - typename IteratorA1::AccessType *dst_ptr = - reinterpret_cast( - this->smem_iterator_A1_.get()); - - CUTLASS_PRAGMA_UNROLL - for (int v = 0; v < IteratorA1::kAccessesPerVector; ++v) { - int const kSrcBytes = - sizeof_bits::value * - IteratorA1::ThreadMap::kElementsPerAccess / - IteratorA1::kAccessesPerVector / 8; - - int src_bytes = (iterator_A0.valid() ? kSrcBytes : 0); - - cutlass::arch::cp_async_zfill( - dst_ptr + v, iterator_A0.get(), iterator_A0.valid()); - - ++iterator_A0; - } - - ++this->smem_iterator_A0_; - } -#endif - iterator_B1.set_iteration_index(0); this->smem_iterator_B1_.set_iteration_index(0); @@ -696,19 +665,14 @@ class B2bMmaMultistage : } // Move to the next stage - //iterator_A1.add_tile_offset({0, 1}); iterator_B1.add_tile_offset({1, 0}); - //this->smem_iterator_A1_.add_tile_offset({0, 1}); this->smem_iterator_B1_.add_tile_offset({1, 0}); // Defines the boundary of a stage of cp.async. cutlass::arch::cp_async_fence(); } - // Perform accumulation in the 'd' output operand -// FragmentC0 accum0 = src_accum; - // DEPBAR+SYNC cutlass::arch::cp_async_wait(); __syncthreads(); @@ -722,7 +686,6 @@ class B2bMmaMultistage : Operator1 warp_mma1; -// this->warp_tile_iterator_A1_.set_kgroup_index(0); this->warp_tile_iterator_B1_.set_kgroup_index(0); warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0], output_op_0); @@ -732,7 +695,6 @@ class B2bMmaMultistage : ++this->warp_tile_iterator_B1_; if (gemm_k_iterations_1 == 0) { -// iterator_A1.clear_mask(); iterator_B1.clear_mask(); } @@ -762,7 +724,6 @@ class B2bMmaMultistage : // Load warp-level tiles from shared memory, wrapping to k offset if // this is the last group as the case may be. 
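        // The rewind of warp_tile_iterator_B1_ below must use
        // Policy1::kPartitionsK: this iterator advances through the second
        // GEMM's k-groups, so rewinding by the first GEMM's partition count
        // (Policy0) would leave it misaligned whenever the two MMA policies
        // differ.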
-// this->warp_tile_iterator_A1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); warp_tile_iterator_A1_.load(warp_loaded_frag_A1[(warp_mma_k + 1) % 2], output_op_0); @@ -777,6 +738,7 @@ class B2bMmaMultistage : warp_loaded_frag_A1[warp_mma_k % 2], warp_loaded_frag_B1[warp_mma_k % 2]); + warp_mma1( accum, warp_transformed_frag_A1[warp_mma_k % 2], @@ -823,7 +785,7 @@ class B2bMmaMultistage : if (smem_read_stage_idx == (Base::kStages - 1)) { this->warp_tile_iterator_B1_.add_tile_offset( - {-Base::kStages * Policy0::kPartitionsK * + {-Base::kStages * Policy1::kPartitionsK * Base::kWarpGemmIterations1, 0}); smem_read_stage_idx = 0; @@ -831,7 +793,6 @@ class B2bMmaMultistage : ++smem_read_stage_idx; } -// --gemm_k_iterations_1; if (gemm_k_iterations_1 == 1) { iterator_B1.clear_mask(); } diff --git a/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h similarity index 99% rename from examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h rename to examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h index 9887932a37..d6cc9922b5 100644 --- a/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h +++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -454,11 +454,11 @@ class B2bMmaPipelined : public B2bMmaBasesmem_iterator_B1_.store(tb_frag_B1); __syncthreads(); - ++smem_iterator_B1_; + ++this->smem_iterator_B1_; // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory if (smem_write_stage_idx == 1) { - smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); } else { this->warp_tile_iterator_B1_.add_tile_offset( diff --git a/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h similarity index 75% rename from examples/13_fused_two_gemms/threadblock/default_b2b_mma.h rename to examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h index b3621f56e6..5a95013159 100644 --- a/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h +++ b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -93,7 +93,7 @@ template < struct DefaultB2bMma; //////////////////////////////////////////////////////////////////////////////// -/// Specialization for row-major output +/// Specialization for row-major output with 2-stage pipeline template < /// Element type for A matrix operand typename ElementA, @@ -110,8 +110,6 @@ template < /// Element type for internal accumulation typename ElementAccumulator, /// Tag indicating architecture to tune for - typename OperatorClass, - /// Tag indicating architecture to tune for typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape0, @@ -129,7 +127,7 @@ template < typename EpilogueOutputOp> struct DefaultB2bMma { @@ -137,11 +135,11 @@ struct DefaultB2bMma; + arch::OpClassTensorOp, 2, Operator>; using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator, layout::RowMajor, - OperatorClass, 2, Operator>; + arch::OpClassTensorOp, 2, Operator>; // Define iterators over tiles from the A operand using IteratorA0 = @@ -162,7 +160,7 @@ struct DefaultB2bMma, //warp shape cutlass::MatrixShape, //accumulator shape MmaCore1::Shape::kK, //kBlocksColumn - ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp, true>; + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>; // Define iterators over tiles from the B operand using IteratorB1 = @@ -181,9 +179,120 @@ struct DefaultB2bMma; }; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output for multi-stage +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// Epilogue output operator + typename EpilogueOutputOp> +struct DefaultB2bMma { + + static cutlass::arch::CacheOperation::Kind const CacheOpA = + ((sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * kAlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + + // Define the MmaCore components + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, Operator, false, CacheOpA, CacheOpB>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, Operator, false, CacheOpA, CacheOpB>; + + // Define iterators over tiles from the A operand + using ThreadMapA0 = typename MmaCore0::IteratorThreadMapA; + using AccessTypeA0 = cutlass::Array; + using IteratorA0 = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA0, AccessTypeA0>; + + // Define iterators over tiles from the B operand + using ThreadMapB0 = typename MmaCore0::IteratorThreadMapB; + using AccessTypeB0 = cutlass::Array; + using IteratorB0 = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB0, AccessTypeB0>; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp>; + + // Define iterators over tiles from the B operand + using ThreadMapB1 = typename MmaCore1::IteratorThreadMapB; + using AccessTypeB1 = cutlass::Array; + using IteratorB1 = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB1, AccessTypeB1>; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaMultistage< + typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA, + MmaCore0::kCacheOpA, + IteratorB0, typename MmaCore0::SmemIteratorB, MmaCore0::kCacheOpB, + typename MmaCore1::Shape, FragmentIteratorA1, + IteratorB1, typename MmaCore1::SmemIteratorB, MmaCore1::kCacheOpB, + ElementAccumulator, layout::RowMajor, + EpilogueOutputOp, + typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy, Stages>; + +}; + + //////////////////////////////////////////////////////////////////////////////// -/// Specialization for column-major-interleaved output +/// Specialization for column-major-interleaved output with 2-stage pipeline template < /// Element type for A matrix operand typename ElementA, @@ -258,7 +367,7 @@ struct DefaultB2bMma, //accumulator shape MmaCore1::Shape::kK, //kBlocksColumn ElementAccumulator, ElementA, AccumulatorLayout, - InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>; + InstructionShape, EpilogueOutputOp>; // Define iterators over tiles from the B operand using IteratorB1 = @@ -281,7 +390,7 @@ struct DefaultB2bMma, //accumulator shape MmaCore1::Shape::kK, //kBlocksColumn ElementAccumulator, ElementA, AccumulatorLayout, - InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>; + InstructionShape, EpilogueOutputOp>; // Define iterators over tiles from the B operand using ThreadMapB1 = typename 
MmaCore1::IteratorThreadMapB; diff --git a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt index 49e1a4f9e3..c8cad3ae72 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt +++ b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu index 84eadc5eab..58f5a87405 100644 --- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu +++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -191,8 +191,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel @@ -258,7 +262,7 @@ int main() { } if (!((props.major * 10 + props.minor) >= 80)) { - std::cerr << "Turing Tensor Core operations must be run on a machine with compute capability at least 80." + std::cerr << "Ampere Tensor Core operations must be run on a machine with compute capability at least 80." << std::endl; notSupported = true; } diff --git a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt index 2d0929c3a8..ce786e653f 100644 --- a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt +++ b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu index 1b233c488b..c88a889b01 100644 --- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu +++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
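The `can_implement()` call inserted above (and in the other examples touched by this change) lets the host reject unsupported problem shapes or misaligned operands before any workspace is allocated or kernels are launched. A hedged sketch of handling the returned status explicitly instead of aborting through `CUTLASS_CHECK`; the error-string helper is from `cutlass/cutlass.h`, and `gemm_op`, `arguments`, and `workspace` are assumed to be the objects defined earlier in the example:

```cpp
#include <iostream>
#include "cutlass/cutlass.h"

cutlass::Status status = gemm_op.can_implement(arguments);

if (status != cutlass::Status::kSuccess) {
  // Typical causes: operand pointers or leading dimensions that violate the kernel's
  // alignment requirements, or a problem shape the instantiated kernel cannot tile.
  std::cerr << "GEMM cannot be run with these arguments: "
            << cutlass::cutlassGetStatusString(status) << std::endl;
  return -1;
}

// Only after the check succeeds do we pay for workspace setup and the launch.
status = gemm_op.initialize(arguments, workspace.get());
```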
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -223,8 +223,12 @@ int run() { // Instantiate CUTLASS kernel depending on templates Gemm gemm_op; + // Check the problem size is supported or not + cutlass::Status status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + // Initialize CUTLASS kernel with arguments and workspace pointer - cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + status = gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(status); // Launch initialized CUTLASS kernel diff --git a/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt similarity index 94% rename from examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt rename to examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt index 1b7daac3dc..42db35fa14 100644 --- a/examples/22_ampere_tensorop_conv2dfprop/CMakeLists.txt +++ b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -22,7 +22,7 @@ cutlass_example_add_executable( - 22_ampere_tensorop_conv2dfprop + 16_ampere_tensorop_conv2dfprop ampere_tensorop_conv2dfprop.cu ) diff --git a/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu similarity index 97% rename from examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu rename to examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu index cb7c398661..4c417bc60b 100644 --- a/examples/22_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu +++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -318,7 +318,7 @@ struct Options { /// Prints the usage statement. 
std::ostream & print_usage(std::ostream &out) const { - out << "22_ampere_tensorop_conv2dfprop example\n\n" + out << "16_ampere_tensorop_conv2dfprop example\n\n" << " This example uses Ampere's Tensor Core operators on F16 data types to compute\n" << " forward convolution on tensors of layout NHWC.\n\n" << "Options:\n\n" @@ -340,8 +340,8 @@ struct Options { << " --tag String to replicate across the first column in the results table\n"; out << "\n\nExamples:\n\n" - << "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" - << "$ ./examples/22_ampere_tensorop_conv2dfprop/22_ampere_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; + << "$ ./examples/16_ampere_tensorop_conv2dfprop/16_ampere_tensorop_conv2dfprop --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n" + << "$ ./examples/16_ampere_tensorop_conv2dfprop/16_ampere_tensorop_conv2dfprop --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n"; return out; } @@ -474,8 +474,8 @@ Result profile_convolution(Options const &options) { // Split K dimension into 1 partitions int split_k_slices = 1; - typename ImplicitGemm::Arguments arguments{ - { + // Construct Conv2dProblemSize with user defined output size + cutlass::conv::Conv2dProblemSize problem_size( options.input_size, options.filter_size, options.padding, @@ -483,15 +483,18 @@ Result profile_convolution(Options const &options) { options.dilation, options.output_size(), mode, - split_k_slices - }, + split_k_slices + ); + + // Construct ImplicitGemm::Argument structure with conv2d + // problem size, data pointers, and epilogue values + typename ImplicitGemm::Arguments arguments{ + problem_size, tensor_a.device_ref(), tensor_b.device_ref(), tensor_c.device_ref(), tensor_c.device_ref(), {options.alpha, options.beta}, - - }; // @@ -505,6 +508,9 @@ Result profile_convolution(Options const &options) { // Allocate workspace memory cutlass::device_memory::allocation workspace(workspace_size); + result.status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(result.status); + result.status = implicit_gemm_op.initialize(arguments, workspace.get()); CUTLASS_CHECK(result.status); @@ -522,15 +528,6 @@ Result profile_convolution(Options const &options) { if (options.reference_check) { std::cout << "Verification on host...\n"; - cutlass::conv::Conv2dProblemSize problem_size( - options.input_size, - options.filter_size, - options.padding, - options.conv_stride, - options.dilation, - mode - ); - // Compute with reference implementation cutlass::reference::host::Conv2dFprop< ElementInputA, @@ -576,7 +573,7 @@ Result profile_convolution(Options const &options) { std::stringstream ss; - ss << "22_ampere_workspace_conv2dfprop_" + ss << "16_ampere_workspace_conv2dfprop_" << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c() << "_" << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c() @@ -667,7 +664,7 @@ int main(int argc, char const **args) { bool notSupported = false; - // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0. // // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples. 
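For context on the `Conv2dProblemSize` refactor above: a single problem-size object now feeds both the device kernel arguments and the host reference check, and it also fully determines the implicit-GEMM shape the kernel runs. A small sketch of querying that mapping; `implicit_gemm_problem_size` is the helper declared in `cutlass/conv/conv2d_problem_size.h`, and `problem_size` is assumed to be the object constructed earlier in this example:

```cpp
#include "cutlass/conv/conv2d_problem_size.h"

// For forward propagation (Fprop) the implicit GEMM extents are:
//   M = N * P * Q   (output pixels)
//   N = K           (output channels)
//   K = R * S * C   (filter taps times input channels)
cutlass::gemm::GemmCoord implicit_gemm_size =
    cutlass::conv::implicit_gemm_problem_size(cutlass::conv::Operator::kFprop, problem_size);
```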
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) { diff --git a/examples/13_fused_two_gemms/CMakeLists.txt b/examples/17_fprop_per_channel_bias/CMakeLists.txt similarity index 90% rename from examples/13_fused_two_gemms/CMakeLists.txt rename to examples/17_fprop_per_channel_bias/CMakeLists.txt index ba51537ca2..726f0d202d 100644 --- a/examples/13_fused_two_gemms/CMakeLists.txt +++ b/examples/17_fprop_per_channel_bias/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -20,14 +20,9 @@ # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cutlass_example_add_executable( - 13_fused_two_gemms - fused_gemm.cu - ) -target_include_directories( - 13_fused_two_gemms - PRIVATE - . +cutlass_example_add_executable( + 17_fprop_per_channel_bias + fprop_per_channel_bias.cu ) diff --git a/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu new file mode 100644 index 0000000000..db504935ba --- /dev/null +++ b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu @@ -0,0 +1,300 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** +The convolution version of 12_gemm_bias_relu. Similarly, we put bias vector in Operand C and the +rest is the same as normal convolution. 
+*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/host_reorder.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/device/convolution.h" +#include "cutlass/util/tensor_view_io.h" + +#include "helper.h" + +// The code section below describes datatype for input, output tensors and computation between +// elements +using ElementAccumulator = float; // Data type of accumulator +using ElementComputeEpilogue = ElementAccumulator; // Data type of epilogue computation +using ElementInputA = cutlass::half_t; // Data type of elements in input tensor +using ElementInputB = cutlass::half_t; // Data type of elements in input tensor +using ElementOutput = float; // Data type of elements in output tensor + +using LayoutInputA = cutlass::layout::TensorNHWC; +using LayoutInputB = cutlass::layout::TensorNHWC; +using LayoutOutput = cutlass::layout::TensorNHWC; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm80; + +// This code section describes the tile size a thread block will compute +using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>; // Threadblock tile shape + +// This code section describes tile size a warp will compute +using WarpShape = cutlass::gemm::GemmShape<64, 64, 32>; // Warp tile shape + +// This code section describes the size of MMA op +using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; // TensorCore instruction shape + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; + +// Number of pipelines you want to use +constexpr int NumStages = 4; + +// This code section describe iterator algorithm selected is Analytic or Optimized +static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized; + +// This code section describes the epilogue part of the kernel, we use default value +using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, // Data type of output matrix. + 128 / cutlass::sizeof_bits::value, // The number of elements per vectorized. + // memory access. This becomes the vector width of + // math instructions in the epilogue too. 
+ ElementAccumulator, // Data type of accumulator + ElementComputeEpilogue, // Data type for alpha in linear combination + cutlass::epilogue::thread::ScaleType::NoBetaScaling>; // alpha X C + per channel bias + + +using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< + ElementInputA, LayoutInputA, + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementAccumulator, + MMAOp, + SmArch, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOp, + SwizzleThreadBlock, + NumStages, + cutlass::arch::OpMultiplyAdd, + IteratorAlgorithm +>::Kernel; + +using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +int run() { + + // Construct Conv2dProblemSize with user defined output size + cutlass::conv::Conv2dProblemSize problem_size( + {1, 7, 7, 512}, // activation + {512, 3, 3, 512}, // filter + {1, 1, 1, 1}, // padding + {1, 1}, // striding + {1, 1}, // dilation + cutlass::conv::Mode::kCrossCorrelation, // mode (convolution or cross-correlation) + 1 // split-k slices + ); + + // Initialize tensors using CUTLASS helper functions + cutlass::HostTensor tensor_a(problem_size.activation_extent()); + cutlass::HostTensor tensor_b(problem_size.filter_extent()); + + // Create tensor C with dimensions 1x1x1xk which is the bias vector + cutlass::HostTensor tensor_c_bias({1, 1, 1, problem_size.K}); + + // Create tensor D used to store output from CUTLASS kernel + cutlass::HostTensor tensor_d(problem_size.output_extent()); + // Create matrix D with dimensions M x N used to store output from reference + // kernel + cutlass::HostTensor tensor_ref_d(problem_size.output_extent()); + + // Fill input and output matrices on host using CUTLASS helper functions + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(4), + ElementInputA(-4), + 0); // <- Fill tensor A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(4), + ElementInputB(-4), + 0); // <- Fill tensor B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c_bias.host_view(), + 1, + ElementOutput(4), + ElementOutput(-4), + 0); // <- Fill matrix C on host with uniform-distribution random data + cutlass::reference::host::TensorFill( + tensor_d.host_view()); // <- fill matrix D on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c_bias.sync_device(); + tensor_d.sync_device(); + tensor_ref_d.sync_device(); + + // Initialize alpha for dot product computation + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + + // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename ImplicitGemm::Arguments arguments{ + problem_size, + tensor_a.device_ref(), // <- reference to tensor A on device + tensor_b.device_ref(), // <- reference to tensor B on device + // tensor C is treated as the bias vector. We can enable the CONV + // to project away the N, H, W dimension by setting the stride to zero. 
+ {tensor_c_bias.device_data(), LayoutOutput::Stride(0)}, + tensor_d.device_ref(), // <- reference to tensor D on device + {alpha} }; + + // Instantiate CUTLASS kernel depending on templates + ImplicitGemm implicit_gemm_op; + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Check the problem size is supported or not + cutlass::Status status = implicit_gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + + // Initialize CUTLASS kernel with arguments and workspace pointer + status = implicit_gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = implicit_gemm_op(); + + CUTLASS_CHECK(status); + + // + // Create instantiation for device reference conv kernel + // + + // Launch device reference to compute strictly the product A * B + cutlass::reference::device::Conv2d< + ElementInputA, + LayoutInputA, + ElementInputB, + LayoutInputB, + ElementOutput, + LayoutOutput, + ElementComputeEpilogue, + ElementAccumulator, + cutlass::NumericConverter> + ( + cutlass::conv::Operator::kFprop, + problem_size, + tensor_a.device_ref(), + tensor_b.device_ref(), + tensor_c_bias.device_ref(), + tensor_ref_d.device_ref(), + alpha, 0 + ); + + // Wait for kernels to finish + cudaDeviceSynchronize(); + + // Copy output data from CUTLASS and reference kernel to host for comparison + tensor_d.sync_host(); + tensor_ref_d.sync_host(); + + // Compute bias + relu in host code + for (int n = 0; n < problem_size.N; ++n) { + for (int p = 0; p < problem_size.P; ++p) { + for (int q = 0; q < problem_size.Q; ++q) { + for (int k = 0; k < problem_size.K; ++k) { + + tensor_ref_d.at({n, p, q, k}) = + std::max(ElementOutput(0), + ElementOutput(tensor_ref_d.at({n, p, q, k}) + + tensor_c_bias.at({0, 0, 0, k}))); + } + } + } + } + + // Check if output from CUTLASS kernel and reference kernel are equal or not + std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), + tensor_ref_d.host_view()) + ? "Passed" + : "Failed") + << std::endl; + + CUTLASS_CHECK(status); + return 0; +} + +int main(int argc, char const **args) { + + bool notSupported = false; + + // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0. + // + // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples. + if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) { + std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl; + notSupported = true; + } + + cudaDeviceProp props; + CUDA_CHECK(cudaGetDeviceProperties(&props, 0)); + + if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) { + std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80." + << std::endl; + notSupported = true; + } + + if (notSupported) { + return 0; + } + + return run(); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d51df92c70..e5bfb78ca5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
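The `LayoutOutput::Stride(0)` trick above is what turns operand C into a per-channel bias: with every stride of the NHWC layout set to zero, all (n, p, q) coordinates read the same K-length vector, and `ScaleType::NoBetaScaling` drops the beta term entirely. A minimal host-side sketch of the epilogue math this produces (illustrative helper, not CUTLASS code):

```cpp
#include <algorithm>

// D[n, p, q, k] = max(0, alpha * Acc[n, p, q, k] + bias[k])   -- beta is never applied
float fused_bias_relu(float accumulator, float alpha, float bias_k) {
  return std::max(0.0f, alpha * accumulator + bias_k);
}
```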
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -78,10 +78,11 @@ foreach(EXAMPLE 10_planar_complex 11_planar_complex_array 12_gemm_bias_relu - 13_fused_two_gemms + 13_two_tensor_op_fusion 14_ampere_tf32_tensorop_gemm 15_ampere_sparse_tensorop_gemm - 22_ampere_tensorop_conv2dfprop + 16_ampere_tensorop_conv2dfprop + 17_fprop_per_channel_bias ) add_subdirectory(${EXAMPLE}) diff --git a/include/cutlass/aligned_buffer.h b/include/cutlass/aligned_buffer.h index 8b3bb0713d..75163cae50 100644 --- a/include/cutlass/aligned_buffer.h +++ b/include/cutlass/aligned_buffer.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h index eb0a2ad43b..14b5c9d22a 100644 --- a/include/cutlass/arch/arch.h +++ b/include/cutlass/arch/arch.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/cache_operation.h b/include/cutlass/arch/cache_operation.h index 646b51ded3..d6435fa9bf 100644 --- a/include/cutlass/arch/cache_operation.h +++ b/include/cutlass/arch/cache_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h index d9f386eec7..4abaf0d858 100644 --- a/include/cutlass/arch/memory.h +++ b/include/cutlass/arch/memory.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -187,10 +187,10 @@ struct global_load struct global_store; @@ -294,7 +294,6 @@ struct global_store { ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace arch } // namespace cutlass diff --git a/include/cutlass/arch/memory_sm75.h b/include/cutlass/arch/memory_sm75.h index 3fd121b903..a541bc9ddd 100644 --- a/include/cutlass/arch/memory_sm75.h +++ b/include/cutlass/arch/memory_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ inline __device__ void ldsm(Array & D, void const* ptr); #endif */ -#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) +#if (! defined (__clang__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) extern "C" { // // This NVVM intrinsic is subject to change in future versions of CUDA. @@ -91,7 +91,7 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { // We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to // the previous internal intrinsics if they are available. -#if (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11) +#if (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11) // // This NVVM intrinsic converts an address in shared memory to a plain // unsigned integer. This is necessary to pass to shared memory instructions @@ -104,7 +104,7 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { /// CUTLASS helper to get SMEM pointer return static_cast(__cvta_generic_to_shared(ptr)); -#elif (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) +#elif (! defined (__clang__) && defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) return __nvvm_get_smem_pointer(ptr); @@ -120,7 +120,10 @@ inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { #else - return 0; + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); + return 0; + #endif } @@ -146,7 +149,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -168,7 +173,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -190,7 +197,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -216,7 +225,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -238,7 +249,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } @@ -260,7 +273,9 @@ inline __device__ void ldsm( #else - assert(0); + CUTLASS_UNUSED(D); + CUTLASS_UNUSED(ptr); + CUTLASS_NOT_IMPLEMENTED(); #endif } diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h index 045196cb8f..1b5bb10bb7 100644 --- a/include/cutlass/arch/memory_sm80.h +++ b/include/cutlass/arch/memory_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -74,15 +74,16 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async { - // Make sure the size is supported. 
- static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), - "Size is not supported"); /// Copy CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { #if CUDA_CP_ASYNC_ACTIVATED - + + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); asm volatile( @@ -108,15 +109,16 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async_zfill { - // Make sure the size is supported. - static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), - "Size is not supported"); /// Copy with zero fill CUTLASS_DEVICE cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) { #if CUDA_CP_ASYNC_ACTIVATED - + + // Make sure the size is supported. + static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), + "Size is not supported"); + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); int src_in_bytes = (pred_guard ? SizeInBytes : 0); @@ -146,16 +148,13 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async { - // Make sure the size is supported. - static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), - "Size is not supported"); /// Copy CUTLASS_DEVICE cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { #if CUDA_CP_ASYNC_ACTIVATED - static_assert(SizeInBytes == 16, + static_assert(SizeInBytes == 16, "cp.async only supports CacheOperation::Global when access size is 16B."); unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); @@ -183,16 +182,13 @@ template < /// Size of the access in bytes int SizeInBytes> struct cp_async_zfill { - // Make sure the size is supported. - static_assert((SizeInBytes == 4 || SizeInBytes == 8 || SizeInBytes == 16), - "Size is not supported"); /// Copy with zero fill CUTLASS_DEVICE cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { #if CUDA_CP_ASYNC_ACTIVATED - static_assert(SizeInBytes == 16, + static_assert(SizeInBytes == 16, "cp.async only supports CacheOperation::Global when access size is 16B."); unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h index 729cd17917..1672e60713 100644 --- a/include/cutlass/arch/mma.h +++ b/include/cutlass/arch/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm50.h b/include/cutlass/arch/mma_sm50.h index cc4a94b17e..fa8e1949ec 100644 --- a/include/cutlass/arch/mma_sm50.h +++ b/include/cutlass/arch/mma_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
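Relatedly, the `static_assert`s in `cp_async` / `cp_async_zfill` above were moved inside the `CUDA_CP_ASYNC_ACTIVATED` guard so they only fire on toolchains that actually compile the asynchronous-copy path. A hedged usage sketch of these primitives staging one 16-byte fragment per thread from global to shared memory; this is a device-code fragment that assumes an SM80 target, CUDA 11, and a hypothetical global pointer `gmem`:

```cpp
#include "cutlass/array.h"
#include "cutlass/half.h"
#include "cutlass/arch/memory_sm80.h"

using Fragment = cutlass::Array<cutlass::half_t, 8>;   // 8 x 16-bit elements = 16 bytes

__global__ void stage_tile(Fragment const *gmem) {
  __shared__ Fragment smem[128];

  // Issue the asynchronous copy; with CacheOperation::Global a 16B access maps to cp.async.cg.
  cutlass::arch::cp_async<sizeof(Fragment), cutlass::arch::CacheOperation::Global>(
      &smem[threadIdx.x], gmem + threadIdx.x, /*pred_guard=*/true);

  // Commit the copies issued so far and wait for them before reading shared memory.
  cutlass::arch::cp_async_fence();
  cutlass::arch::cp_async_wait<0>();
  __syncthreads();
}
```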
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h index 5c82f74ec3..1b18609690 100644 --- a/include/cutlass/arch/mma_sm60.h +++ b/include/cutlass/arch/mma_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h index 6cbe260633..5ee65c2574 100644 --- a/include/cutlass/arch/mma_sm61.h +++ b/include/cutlass/arch/mma_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm70.h b/include/cutlass/arch/mma_sm70.h index b03ce2c1de..213d6bb54e 100644 --- a/include/cutlass/arch/mma_sm70.h +++ b/include/cutlass/arch/mma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h index c5e0db9720..62015d3dd7 100644 --- a/include/cutlass/arch/mma_sm75.h +++ b/include/cutlass/arch/mma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h index 289c205cad..c4fdaedf5d 100644 --- a/include/cutlass/arch/mma_sm80.h +++ b/include/cutlass/arch/mma_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -112,7 +112,13 @@ struct Mma< ); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -178,7 +184,13 @@ struct Mma< ); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -230,7 +242,13 @@ struct Mma, 32, tfloat32_t, layout::RowMajor, "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -291,7 +309,13 @@ struct Mma< ); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -352,7 +376,13 @@ struct Mma< "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -413,7 +443,13 @@ struct Mma< "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; @@ -472,7 +508,13 @@ struct Mma< : "d"(A), "d"(B), "d"(C[0]), "d"(C[1])); #else - assert(0); + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + #endif } }; diff --git a/include/cutlass/arch/mma_sparse_sm80.h b/include/cutlass/arch/mma_sparse_sm80.h index a93fd2924c..8d3aaaf0a6 100644 --- a/include/cutlass/arch/mma_sparse_sm80.h +++ b/include/cutlass/arch/mma_sparse_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd.h b/include/cutlass/arch/simd.h index 2503094ad3..4e7265c403 100644 --- a/include/cutlass/arch/simd.h +++ b/include/cutlass/arch/simd.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd_sm60.h b/include/cutlass/arch/simd_sm60.h index 36030a3661..277cf1af36 100644 --- a/include/cutlass/arch/simd_sm60.h +++ b/include/cutlass/arch/simd_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
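The `assert(0)` replacements in `mma_sm80.h` above follow a consistent pattern: when the target architecture does not provide the instruction, every operand is marked as used and the path fails through `CUTLASS_NOT_IMPLEMENTED()`, which also keeps unused-parameter warnings quiet on builds that never reach the PTX. A rough, hypothetical skeleton of that shape (the real specializations are the ones in this diff):

```cpp
#include "cutlass/cutlass.h"
#include "cutlass/arch/mma_sm80.h"

CUTLASS_DEVICE
void mma_or_trap(float (&d)[4], unsigned const (&a)[4], unsigned const (&b)[2], float const (&c)[4]) {
#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
  // ... inline PTX for the mma.sync instruction would go here ...
#else
  CUTLASS_UNUSED(d);
  CUTLASS_UNUSED(a);
  CUTLASS_UNUSED(b);
  CUTLASS_UNUSED(c);
  CUTLASS_NOT_IMPLEMENTED();
#endif
}
```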
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd_sm61.h b/include/cutlass/arch/simd_sm61.h index 94f1c617c3..3f7b2d8ae3 100644 --- a/include/cutlass/arch/simd_sm61.h +++ b/include/cutlass/arch/simd_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma.h b/include/cutlass/arch/wmma.h index 0a556aee3a..fa6d288a61 100644 --- a/include/cutlass/arch/wmma.h +++ b/include/cutlass/arch/wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm70.h b/include/cutlass/arch/wmma_sm70.h index 94eeb93deb..55af75a4ae 100644 --- a/include/cutlass/arch/wmma_sm70.h +++ b/include/cutlass/arch/wmma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm72.h b/include/cutlass/arch/wmma_sm72.h index 1b8cc1161e..9e79d16ad4 100644 --- a/include/cutlass/arch/wmma_sm72.h +++ b/include/cutlass/arch/wmma_sm72.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm75.h b/include/cutlass/arch/wmma_sm75.h index f630712fc6..e0d15bf4a7 100644 --- a/include/cutlass/arch/wmma_sm75.h +++ b/include/cutlass/arch/wmma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/array.h b/include/cutlass/array.h index 3faa11d022..4eee99602b 100644 --- a/include/cutlass/array.h +++ b/include/cutlass/array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/array_planar_complex.h b/include/cutlass/array_planar_complex.h index e2dbbc47cb..0d9a94a987 100644 --- a/include/cutlass/array_planar_complex.h +++ b/include/cutlass/array_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/array_subbyte.h b/include/cutlass/array_subbyte.h index 78081facc7..81008df727 100644 --- a/include/cutlass/array_subbyte.h +++ b/include/cutlass/array_subbyte.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,9 +45,6 @@ template < class Array { public: - static_assert(sizeof_bits::value * N >= 8, - "Array<> specialized for sub-byte types assume the actual stored element size is 1 byte"); - static int const kSizeBits = sizeof_bits::value * N; /// Storage type diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h index 3a4b8bd76e..fc32a509ad 100644 --- a/include/cutlass/bfloat16.h +++ b/include/cutlass/bfloat16.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h index 7c0ab3b4f3..3312619cbb 100644 --- a/include/cutlass/complex.h +++ b/include/cutlass/complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -52,6 +52,23 @@ enum class ComplexTransform { kConjugate }; +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines ComplexTransform inversions +template +struct InvertComplexTransform; + +/// Invert ComplexTransform from kNone to kConjugate +template <> +struct InvertComplexTransform { + static ComplexTransform const transform = ComplexTransform::kConjugate; +}; + +/// Invert ComplexTransform from kConjugate to kNone +template <> +struct InvertComplexTransform { + static ComplexTransform const transform = ComplexTransform::kNone; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// // @@ -291,6 +308,30 @@ CUTLASS_HOST_DEVICE T &imag(complex &z) { return z.imag(); } +/// Returns the real part of the real number +template +CUTLASS_HOST_DEVICE T const &real(T const &r) { + return r; +} + +/// Returns the real part of the real number +template +CUTLASS_HOST_DEVICE T &real(T &r) { + return r; +} + +/// Returns the imaginary part of the real number +template +CUTLASS_HOST_DEVICE T const &imag(T const &r) { + return T(); +} + +/// Returns the imaginary part of the complex number +template +CUTLASS_HOST_DEVICE T &imag(T &r) { + return T(); +} + // // Output operators // diff --git a/include/cutlass/constants.h b/include/cutlass/constants.h index 690891b227..9666b2b9d5 100644 --- a/include/cutlass/constants.h +++ b/include/cutlass/constants.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/conv2d_problem_size.h b/include/cutlass/conv/conv2d_problem_size.h index 735103722d..fd87e1acdd 100644 --- a/include/cutlass/conv/conv2d_problem_size.h +++ b/include/cutlass/conv/conv2d_problem_size.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/conv3d_problem_size.h b/include/cutlass/conv/conv3d_problem_size.h index 91827d2724..495fcc3bf2 100644 --- a/include/cutlass/conv/conv3d_problem_size.h +++ b/include/cutlass/conv/conv3d_problem_size.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
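The `InvertComplexTransform` trait and the scalar `real()` / `imag()` overloads added to `complex.h` above make it easier to write code that is generic over real and complex element types. A small compile-time usage sketch, assuming this change is applied:

```cpp
#include "cutlass/complex.h"

static_assert(
    cutlass::InvertComplexTransform<cutlass::ComplexTransform::kNone>::transform ==
        cutlass::ComplexTransform::kConjugate,
    "kNone inverts to kConjugate");

static_assert(
    cutlass::InvertComplexTransform<cutlass::ComplexTransform::kConjugate>::transform ==
        cutlass::ComplexTransform::kNone,
    "kConjugate inverts to kNone");

// The new scalar overloads let generic code call real() on plain floats as well as on
// cutlass::complex<float>:
inline float real_part_of(float x) { return cutlass::real(x); }   // returns x itself
```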
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -188,7 +188,7 @@ struct Conv3dProblemSize : public Conv2dProblemSize { mode, split_k_slices, groups ) { // set output Z - Z = ((D + pad_d - T * dilation_d) / stride_d) + 1; + Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1; } /// Equality operator (ignores mode and split_k_slice) diff --git a/include/cutlass/conv/convolution.h b/include/cutlass/conv/convolution.h index c743ea6faa..95afe94f57 100644 --- a/include/cutlass/conv/convolution.h +++ b/include/cutlass/conv/convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/device/implicit_gemm_convolution.h b/include/cutlass/conv/device/implicit_gemm_convolution.h index 0aa03d1997..2e5e3b0c82 100644 --- a/include/cutlass/conv/device/implicit_gemm_convolution.h +++ b/include/cutlass/conv/device/implicit_gemm_convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/kernel/default_conv2d.h b/include/cutlass/conv/kernel/default_conv2d.h index 57fae79655..603856a4f1 100644 --- a/include/cutlass/conv/kernel/default_conv2d.h +++ b/include/cutlass/conv/kernel/default_conv2d.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -41,7 +41,6 @@ #include "cutlass/conv/threadblock/implicit_gemm_pipelined.h" #include "cutlass/conv/threadblock/implicit_gemm_multistage.h" #include "cutlass/conv/kernel/implicit_gemm_convolution.h" - ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -101,4 +100,3 @@ struct DefaultConvEpilogue< } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/include/cutlass/conv/kernel/default_conv2d_dgrad.h index c590f57efc..f81c389728 100644 --- a/include/cutlass/conv/kernel/default_conv2d_dgrad.h +++ b/include/cutlass/conv/kernel/default_conv2d_dgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
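The `Conv3dProblemSize` change above fixes the output-depth computation so that padding is counted on both sides of the input depth. A quick worked check of the corrected formula, using unit dilation and stride for readability:

```cpp
#include <cassert>

int output_depth(int D, int pad_d, int T, int dilation_d, int stride_d) {
  // Corrected formula: padding contributes on both sides of the input depth.
  return ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1;
}

int main() {
  // A 'same'-padded 3-tap filter (pad_d = 1, dilation = 1, stride = 1) preserves depth:
  assert(output_depth(8, 1, 3, 1, 1) == 8);   // the old single-sided formula gave 7
  return 0;
}
```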
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop.h b/include/cutlass/conv/kernel/default_conv2d_fprop.h index c38d5150b1..d22fb7f0ba 100644 --- a/include/cutlass/conv/kernel/default_conv2d_fprop.h +++ b/include/cutlass/conv/kernel/default_conv2d_fprop.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -1376,4 +1376,3 @@ struct DefaultConv2dFprop < } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/include/cutlass/conv/kernel/default_conv2d_wgrad.h index c7912203a4..1bb68689d0 100644 --- a/include/cutlass/conv/kernel/default_conv2d_wgrad.h +++ b/include/cutlass/conv/kernel/default_conv2d_wgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/include/cutlass/conv/kernel/default_conv3d_dgrad.h index a92b4bfb6a..475cceecc6 100644 --- a/include/cutlass/conv/kernel/default_conv3d_dgrad.h +++ b/include/cutlass/conv/kernel/default_conv3d_dgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -34,6 +34,9 @@ #include "cutlass/cutlass.h" #include "cutlass/conv/kernel/default_conv2d.h" +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h" + #include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h" #include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h" #include "cutlass/conv/threadblock/conv2d_tile_iterator.h" @@ -45,7 +48,7 @@ namespace conv { namespace kernel { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Defines a kernel for Conv2dDgrad +/// Defines a kernel for Conv3dDgrad template < typename ElementA, typename LayoutA, @@ -67,7 +70,7 @@ template < conv::StrideSupport StrideSupport = StrideSupport::kStrided > struct DefaultConv3dDgrad; -/// Defines a kernel for Conv2dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided +/// Defines a kernel for Conv3dDgrad specialzation for Analytic IteratorAlgorithm Dgrad Strided // and multistage pipeline. 
template < typename ElementA, @@ -174,6 +177,117 @@ struct DefaultConv3dDgrad < }; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dDgrad specialzation for Optimized IteratorAlgorithm Dgrad Strided +// and multistage pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernel diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop.h b/include/cutlass/conv/kernel/default_conv3d_fprop.h index 7694c8b9e8..5660458855 100644 --- a/include/cutlass/conv/kernel/default_conv3d_fprop.h +++ b/include/cutlass/conv/kernel/default_conv3d_fprop.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -34,6 +34,10 @@ #include "cutlass/cutlass.h" #include "cutlass/conv/kernel/default_conv2d.h" +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h" + + #include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h" #include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h" @@ -68,6 +72,113 @@ template < ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dFprop specialization for Analytic Iterator Algorithm +/// and 2 stage pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Defines a kernel for Conv2dFprop specialzation for Analytic IteratorAlgorithm and multistage // pipeline. 
template < @@ -173,6 +284,114 @@ struct DefaultConv3dFprop < ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dFprop specialzation for Optimized IteratorAlgorithm and multistage +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace kernel } // namespace conv } // namespace cutlass diff --git a/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/include/cutlass/conv/kernel/default_conv3d_wgrad.h index b0f5b91558..2f7ea86807 100644 --- a/include/cutlass/conv/kernel/default_conv3d_wgrad.h +++ b/include/cutlass/conv/kernel/default_conv3d_wgrad.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/include/cutlass/conv/kernel/implicit_gemm_convolution.h index 2ec1566889..fbc44b15b0 100644 --- a/include/cutlass/conv/kernel/implicit_gemm_convolution.h +++ b/include/cutlass/conv/kernel/implicit_gemm_convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -216,8 +216,7 @@ struct ImplicitGemmConvolution { ): problem_size(args.problem_size), implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), - grid_tiled_shape(grid_tiled_shape), - iterator_A(args.problem_size, args.ref_A.layout()), + iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())), ptr_A(args.ref_A.data()), iterator_B(args.problem_size, args.ref_B.layout()), ptr_B(args.ref_B.data()), diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h index 14c8a4e829..8afb4968b1 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h index f76dcde931..937216d5e6 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h index d32da7c3bf..e33e4ccb23 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -186,6 +186,11 @@ class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { @@ -402,6 +407,11 @@ class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h index 71299cf578..078c9e7fc1 100644 --- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -98,35 +98,7 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized { // Parameters structure // - struct Params : Conv2dDgradOutputGradientIteratorOptimizedParams { - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params(Conv2dDgradOutputGradientIteratorOptimizedParams const &base): - Conv2dDgradOutputGradientIteratorOptimizedParams(base) { } - - CUTLASS_HOST_DEVICE - Params( - Conv2dProblemSize const &problem_size, - Layout const &layout - ): - Conv2dDgradOutputGradientIteratorOptimizedParams( - problem_size, - layout, - sizeof_bits::value, - {Shape::kRow, Shape::kColumn}, - ThreadMap::kThreads, - ThreadMap::kElementsPerAccess, - {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, - {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} - ) { } - }; + using Params = Conv2dDgradOutputGradientIteratorOptimizedParams; private: @@ -239,10 +211,22 @@ class Conv2dDgradOutputGradientTileAccessIteratorOptimized { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + private: /// Returns the coordinate in the output gradient tensor dy that is correspoinding to - // output nhw and filter position k, r, s + // activation nhw and filter position k, r, s CUTLASS_HOST_DEVICE TensorCoord at_(int n, int h, int w, int r, int s) const { diff --git 
a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h index 92dd705d6b..51a5150456 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -146,6 +146,11 @@ class Conv2dFpropActivationTileAccessIteratorAnalytic { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h index afb015d352..573255da32 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -95,33 +95,7 @@ class Conv2dFpropActivationTileAccessIteratorOptimized { // Parameters structure // - struct Params : Conv2dFpropActivationIteratorOptimizedParams { - - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params(Conv2dFpropActivationIteratorOptimizedParams const &base): - Conv2dFpropActivationIteratorOptimizedParams(base) { } - - CUTLASS_HOST_DEVICE - Params( - Conv2dProblemSize const &problem_size, - Layout const &layout - ): - Conv2dFpropActivationIteratorOptimizedParams( - problem_size, - layout, - sizeof_bits::value, - {Shape::kRow, Shape::kColumn}, - ThreadMap::kThreads, - ThreadMap::kElementsPerAccess, - {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, - {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} - ) { - - } - }; + using Params = Conv2dFpropActivationIteratorOptimizedParams; private: @@ -234,6 +208,18 @@ class Conv2dFpropActivationTileAccessIteratorOptimized { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + private: /// Returns the coordinate in the activations tensor X that is correspoinding to diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h index 6547e9c5ba..b0a89adae2 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h index bf0d1d3124..2f12e41fef 100644 --- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_params.h b/include/cutlass/conv/threadblock/conv2d_params.h index ac6b2e3095..3c64b1f75e 100644 --- a/include/cutlass/conv/threadblock/conv2d_params.h +++ b/include/cutlass/conv/threadblock/conv2d_params.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -68,7 +68,7 @@ struct Conv2dAnalyticParams { CUTLASS_HOST_DEVICE Conv2dAnalyticParams( - Conv2dProblemSize const &problem_size, + Conv2dProblemSize const &, // unused; placeholder to match other Params interfaces. Layout const &layout ): layout(layout) { @@ -168,7 +168,10 @@ struct Conv2dFpropActivationIteratorOptimizedParams { layout::PitchLinearCoord threadmap_iterations, layout::PitchLinearCoord threadmap_delta ): - layout(layout), PQ(problem_size.P * problem_size.Q), pq_divmod(PQ), q_divmod(problem_size.Q) { + layout(layout), + PQ(problem_size.P * problem_size.Q), + pq_divmod(PQ), + q_divmod(problem_size.Q) { TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); @@ -176,7 +179,9 @@ struct Conv2dFpropActivationIteratorOptimizedParams { int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1); // next S - inc_next[0] = conv_sign * (int64_t(layout.stride()[0]) * problem_size.dilation_w) * element_size_bits / 8; + inc_next[0] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_w + ) * element_size_bits / 8; // next R inc_next[1] = conv_sign * ( @@ -388,7 +393,7 @@ struct Conv2dDgradOutputGradientIteratorOptimizedParams { int filter_k_delta; // number of logical elements to add to filter_k_ - int HW; // product of H*W + int HW; // product of H*W FastDivmod hw_divmod; FastDivmod w_divmod; @@ -411,7 +416,10 @@ struct Conv2dDgradOutputGradientIteratorOptimizedParams { layout::PitchLinearCoord threadmap_iterations, layout::PitchLinearCoord threadmap_delta ): - layout(layout), HW(problem_size.H *problem_size.W), hw_divmod(HW), w_divmod(problem_size.W) { + layout(layout), + HW(problem_size.H *problem_size.W), + hw_divmod(HW), + w_divmod(problem_size.W) { TRACE_CONV_INITIALIZERS("conv2d_dgrad", "output_gradient", element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); @@ -419,7 +427,9 @@ struct Conv2dDgradOutputGradientIteratorOptimizedParams { int conv_sign = (problem_size.mode == Mode::kConvolution ? 
1 : -1); // next S - inc_next[0] = conv_sign * (layout.stride()[0] * problem_size.dilation_w) * element_size_bits / 8; + inc_next[0] = conv_sign * ( + layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; // next R inc_next[1] = conv_sign * ( diff --git a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h index ce52017e37..61f02d19fe 100644 --- a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h +++ b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -92,6 +92,12 @@ class TileIterator { ): tile_access_iterator_(params, problem_size, ptr, thread_idx, threadblock_offset) { } + CUTLASS_HOST_DEVICE + static Params getParams(ConvProblemSize const &problem_size, Layout const &layout) { + return TileAccessIterator::getParams(problem_size, layout); + } + + /// Adds a pointer offset in units of Element CUTLASS_HOST_DEVICE void add_pointer_offset(LongIndex pointer_offset) { diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h index 13d8338c2f..1e3a5837d0 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h index 74a887794b..7762d6191f 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h index 84c788d6d4..53fc920575 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,6 +133,11 @@ class Conv2dWgradOutputGradientTileAccessIteratorAnalytic { } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h index 4a20cb1d8b..f138ef59a4 100644 --- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -86,35 +86,7 @@ class Conv2dWgradOutputGradientTileAccessIteratorOptimized { // Parameters structure // - struct Params : Conv2dWgradOutputGradientIteratorOptimizedParams { - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params(Conv2dWgradOutputGradientIteratorOptimizedParams const &base): - Conv2dWgradOutputGradientIteratorOptimizedParams(base) { } - - CUTLASS_HOST_DEVICE - Params( - Conv2dProblemSize const &problem_size, - Layout const &layout - ): - Conv2dWgradOutputGradientIteratorOptimizedParams( - problem_size, - layout, - sizeof_bits::value, - {Shape::kRow, Shape::kColumn}, - ThreadMap::kThreads, - ThreadMap::kElementsPerAccess, - {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, - {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} - ) { } - }; + using Params = Conv2dWgradOutputGradientIteratorOptimizedParams; private: @@ -176,6 +148,18 @@ class Conv2dWgradOutputGradientTileAccessIteratorOptimized { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h index 0033568278..01437547c6 100644 --- a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..ee532ff61e --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h @@ -0,0 +1,283 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv3dDgradFilterTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = StrideSupport_; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + // + // Parameters structure + // + + struct Params : Conv3dDgradFilterIteratorOptimizedParams { + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv3dDgradFilterIteratorOptimizedParams const &base): + Conv3dDgradFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): + Conv3dDgradFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + 
ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { } + + }; + +private: + + Conv3dDgradFilterIteratorOptimizedParams const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_trs_; + int filter_k_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided * + ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorOptimized( + Conv3dDgradFilterIteratorOptimizedParams const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_trs_(0), + filter_k_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.strided(); + Index column = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided; + int filter_c = column + c * ThreadMap::Delta::kContiguous; + + uint32_t pred = ((filter_k < problem_size_.K && filter_c < problem_size_.C) ? 
1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + pointer_ += ( + filter_k_ * params.layout.stride()[3] + column + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_trs; + + // moves to the next tile + ++filter_trs_; + if (filter_trs_ == params_.TRS) { + + filter_trs_ = 0; + next = params_.inc_next_k; + filter_k_ += params_.filter_k_delta; + } + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + predicates_ = (predicates_ & (~kClearMask)); + } + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_strided; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h index 47e7de46a0..1d70ab3d57 100644 --- a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -212,6 +212,11 @@ class Conv3dDgradOutputGradientTileAccessIteratorAnalytic < } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..2a62c2924b --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,484 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv3dDgradOutputGradientTileAccessIteratorOptimized { +public: + + static_assert(StrideSupport_ == conv::StrideSupport::kUnity, + "Only unit-stride dgrad is supported at this time."); + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + using Coord3D = Coord<3>; + + using Mask = uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv3dDgradOutputGradientIteratorOptimizedParams; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (t, r, s) + int filter_t_; + int filter_r_; + int filter_s_; + int filter_k_; + + Index masks_[ThreadMap::Iterations::kStrided][3]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorOptimized( + Params const ¶ms, + 
ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + filter_k_(0), + filter_t_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_d[ThreadMap::Iterations::kStrided]; + int offset_h[ThreadMap::Iterations::kStrided]; + int offset_w[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // offset_n[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W); + // int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W); + // + // + // offset_d[s] = residual / (problem_size_.H * problem_size_.W); + // residual = residual % (problem_size_.H * problem_size_.W); + // + // offset_h[s] = residual / problem_size_.W; + // offset_w[s] = residual % problem_size_.W; + // + + int residual; + + // input: (ndhw offset) output: (n offset and resudial (dhw offset)) + params_.dhw_divmod(offset_n[s], residual, offset_ndhw); + // input: (dhw offset) output: (d offset and resudial (hw)) + params_.hw_divmod(offset_d[s], residual, residual); + // input: (hw offset) output: (h offset and resudial (w offset)) + params_.w_divmod(offset_h[s], offset_w[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_d[s], offset_h[s], offset_w[s], 0, 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + CUTLASS_PRAGMA_NO_UNROLL + for (int t = 0; t < problem_size_.T; ++t) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int t_ = t; + if (problem_size_.mode == Mode::kConvolution) { + t_ = problem_size_.T - 1 - t; + } + + int z = offset_d[s_idx] + problem_size_.pad_d - t_ * problem_size_.dilation_d; + + bool pred = (offset_n[s_idx] < problem_size_.N && z >= 0 && z < problem_size_.Z); + masks_[s_idx][0] |= (pred << t); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h; + + bool pred = (p >= 0 && p < problem_size_.P); + masks_[s_idx][1] |= (pred << r); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int q = offset_w[s_idx] + problem_size_.pad_w - s_ * problem_size_.dilation_w; + + bool pred = (q >= 0 && q < problem_size_.Q); + masks_[s_idx][2] |= (pred << s); + } + } + + if (filter_k_ >= problem_size.K) { + clear_mask(); + } + + set_iteration_index(0); + + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const 
&problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + +private: + + + /// Returns the coordinate in the output gradient tensor dy that is correspoinding to + // activation ndhw and filter position k, t, r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int d, int h, int w, int t, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + t = problem_size_.T - 1 - t; + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int z = d + problem_size_.pad_d - t * problem_size_.dilation_d; + int p = h + problem_size_.pad_h - r * problem_size_.dilation_h; + int q = w + problem_size_.pad_w - s * problem_size_.dilation_w; + + return TensorCoord(n, z, p, q, filter_k_); + } + + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are generated. Instead, our + // intent is to predicate the mov instructions. + #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][2]) + : + "r"((int)clear), + "r"(masks_[s][2]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + masks_[s][2] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + + filter_s_ = 0; + ++filter_r_; + next_idx = 1; + + if (filter_r_ == problem_size_.R) { + filter_r_ = 0; + ++filter_t_; + + if (filter_t_ < problem_size_.T) { + next_idx = 2; + } + else { + filter_t_ = 0; + next_idx = 3; + } + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 3) { + filter_k_ += params_.filter_k_delta; + } + + clear_mask_(filter_k_ >= problem_size_.K); + + } + + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask() { + 
CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + masks_[s][2] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][2] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // This is specialized for unit stride + if (problem_size.stride() != Coord3D({1, 1, 1})) { + return Status::kErrorNotSupported; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % (128/sizeof_bits::value)) { + return Status::kErrorNotSupported; + } + + // Limit on filter size + if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h index f5d14b5b10..7cadf860f7 100644 --- a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -46,6 +46,7 @@ #include "cutlass/layout/matrix.h" #include "cutlass/conv/convolution.h" #include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -91,25 +92,7 @@ class Conv3dFpropActivationTileAccessIteratorAnalytic { // Parameters structure // - struct Params { - - Layout layout; - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params( - ConvProblemSize const &problem_size, - Layout const &layout - ): layout(layout) { - - } - }; + using Params = Conv3dAnalyticParams; private: @@ -168,6 +151,11 @@ class Conv3dFpropActivationTileAccessIteratorAnalytic { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..9246c59221 --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h @@ -0,0 +1,472 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. 
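With the per-iterator Params structs replaced by the shared Conv3dAnalyticParams and a static getParams() hook, callers can construct iterator parameters without knowing each iterator's internals. The mock-up below only illustrates that pattern; every type and value in it is invented for the example and is not part of the patch.

#include <cstdio>

// Toy stand-ins for Conv3dProblemSize, the layout object, and an iterator
// that aliases a shared params type and exposes a static getParams() factory.
struct ToyProblemSize { int N, D, H, W, C; };
struct ToyLayout { long long stride[4]; };

struct ToySharedParams {
  ToyLayout layout;
  ToySharedParams() = default;
  ToySharedParams(ToyProblemSize const &, ToyLayout const &layout_) : layout(layout_) {}
};

struct ToyIterator {
  using Params = ToySharedParams;   // shared params type, as in Conv3dAnalyticParams

  static Params getParams(ToyProblemSize const &ps, ToyLayout const &layout) {
    return Params(ps, layout);
  }
};

int main() {
  ToyProblemSize ps{1, 8, 8, 8, 32};
  ToyLayout layout{{32, 32 * 8, 32 * 8 * 8, 32 * 8 * 8 * 8}};

  ToyIterator::Params params = ToyIterator::getParams(ps, layout);
  std::printf("stride[0] = %lld\n", params.layout.stride[0]);
  return 0;
}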
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv3dFpropActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + using Mask = uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv3dFpropActivationIteratorOptimizedParams; + +private: + + Conv3dFpropActivationIteratorOptimizedParams const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (t, r, s) + int filter_t_; + int filter_r_; + int filter_s_; + int filter_c_; + + // mask for t, r, and s + Index masks_[ThreadMap::Iterations::kStrided][3]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorOptimized( + Conv3dFpropActivationIteratorOptimizedParams const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ) : + params_(params), + problem_size_(problem_size), + filter_t_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_z[ThreadMap::Iterations::kStrided]; + int offset_p[ThreadMap::Iterations::kStrided]; + int offset_q[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are 
equivalent to the following logical computation: + // + // + // offset_n[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + // int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + // + // offset_z[s] = residual / (problem_size_.P * problem_size_.Q); + // residual = residual % (problem_size_.P * problem_size_.Q); + // + // offset_p[s] = residual / problem_size_.Q; + // offset_q[s] = residual % problem_size_.Q; + // + + int residual; + + // input: (nzpq offset) output: (n offset and resudial (zpq offset)) + params.zpq_divmod(offset_n[s], residual, offset_nzpq); + // input: (zpq offset) output: (z offset and resudial (pq)) + params.pq_divmod(offset_z[s], residual, residual); + // input: (pq offset) output: (p offset and resudial (q offset)) + params.q_divmod(offset_p[s], offset_q[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_z[s], offset_p[s], offset_q[s], 0, 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + // mask predicates for filter position T + CUTLASS_PRAGMA_NO_UNROLL + for (int t = 0; t < problem_size_.T; ++t) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int t_ = t; + if (problem_size_.mode == Mode::kConvolution) { + t_ = problem_size_.T - 1 - t; + } + + int d = offset_z[s_idx] * problem_size_.stride_d - problem_size_.pad_d + t_ * problem_size_.dilation_d; + + bool pred = (offset_n[s_idx] < problem_size_.N && d >= 0 && d < problem_size_.D); + masks_[s_idx][0] |= (pred << t); + } + } + + // mask predicates for filter position R + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h; + + bool pred = (h >= 0 && h < problem_size_.H); + masks_[s_idx][1] |= (pred << r); + } + } + + // mask predicates for filter position S + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w; + + bool pred = (w >= 0 && w < problem_size_.W); + masks_[s_idx][2] |= (pred << s); + } + } + + if (filter_c_ >= problem_size.C) { + clear_mask(); + } + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + +private: + + /// Returns the coordinate in the activations tensor X that is correspoinding to + // output nzpq and filter position t, r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int z, int p, int q, int t, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + t = problem_size_.T - 1 - t; + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int d = z * problem_size_.stride_d 
- problem_size_.pad_d + t * problem_size_.dilation_d; + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, d, h, w, filter_c_); + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are generated. Instead, our + // intent is to predicate the mov instructions. + #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][2]) + : + "r"((int)clear), + "r"(masks_[s][2]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + masks_[s][2] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + + filter_s_ = 0; + ++filter_r_; + next_idx = 1; + + if (filter_r_ == problem_size_.R) { + filter_r_ = 0; + ++filter_t_; + + if (filter_t_ < problem_size_.T) { + next_idx = 2; + } + else { + filter_t_ = 0; + next_idx = 3; + } + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 3) { + filter_c_ += params_.filter_c_delta; + } + + clear_mask_(filter_c_ >= problem_size_.C); + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask() { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + masks_[s][2] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][2] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + 
Conv3dFpropActivationTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + // Conv3dFpropActivationTileAccessIteratorOptimized has constraint on filter positions + // due to the number of mask bits. + if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h index bad6598baf..a7f543681b 100644 --- a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,6 +45,7 @@ #include "cutlass/layout/matrix.h" #include "cutlass/conv/convolution.h" #include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -90,24 +91,7 @@ class Conv3dFpropFilterTileAccessIteratorAnalytic { // Parameters structure // - struct Params { - - Layout layout; - - // - // Methods - // - CUTLASS_HOST_DEVICE - Params() { } - - CUTLASS_HOST_DEVICE - Params( - ConvProblemSize const &problem_size, - Layout const &layout - ): layout(layout) { - - } - }; + using Params = Conv3dAnalyticParams; private: diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000..5d814890bd --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h @@ -0,0 +1,270 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
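The constructor of the iterator above replaces integer division and modulo with precomputed FastDivmod objects; its comment spells out the equivalent arithmetic. The sketch below is that plain-arithmetic decomposition of a linear (n, z, p, q) offset, with toy extents chosen only so the assertion can check the result.

#include <cassert>
#include <cstdio>

int main() {
  int Z = 4, P = 6, Q = 5;                                       // toy output extents
  int offset_nzpq = 3 * (Z * P * Q) + 2 * (P * Q) + 4 * Q + 1;   // encodes n=3, z=2, p=4, q=1

  int n = offset_nzpq / (Z * P * Q);
  int residual = offset_nzpq % (Z * P * Q);

  int z = residual / (P * Q);
  residual = residual % (P * Q);

  int p = residual / Q;
  int q = residual % Q;

  assert(n == 3 && z == 2 && p == 4 && q == 1);
  std::printf("n=%d z=%d p=%d q=%d\n", n, z, p, q);
  return 0;
}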
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv3dFpropFilterTileAccessIteratorOptimized{ +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv3dFpropFilterIteratorOptimizedParams { + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv3dFpropFilterIteratorOptimizedParams 
const &base): + Conv3dFpropFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): + Conv3dFpropFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { + + } + }; + +private: + + Conv3dFpropFilterIteratorOptimizedParams const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_trs_; + int filter_c_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorOptimized( + Conv3dFpropFilterIteratorOptimizedParams const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_trs_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + Index column = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < problem_size_.K) ? 
1u : 0); + predicates_ |= (pred << s); + } + + if (filter_c_ >= problem_size.C) { + predicates_ = 0u; + } + + pointer_ += ( + params_.layout({filter_c_, column}) + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_trs; + + // moves to the next tile + ++filter_trs_; + if (filter_trs_ == params_.TRS) { + + filter_trs_ = 0; + next = params_.inc_next_c; + filter_c_ += params_.filter_c_delta; + } + + if (filter_c_ >= problem_size_.C) { + predicates_ = 0; + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + return (predicates_ & (1u << iteration_strided_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_k; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % (128/sizeof_bits::value)) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/conv/threadblock/conv3d_params.h b/include/cutlass/conv/threadblock/conv3d_params.h new file mode 100644 index 0000000000..c95b52d90e --- /dev/null +++ b/include/cutlass/conv/threadblock/conv3d_params.h @@ -0,0 +1,363 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
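The filter iterator above evaluates its bounds checks once at construction and packs one guard bit per strided load into predicates_, so valid() reduces to a single mask test. A minimal standalone version of that packing follows; the iteration count, filter extent, and offsets are toy numbers.

#include <cstdint>
#include <cstdio>

int main() {
  int const kStridedIterations = 4;   // toy value; must stay below 32 per the static_assert above
  int K = 48;                         // toy filter count
  int delta_strided = 16;             // toy spacing between strided loads
  int column = 8;                     // toy starting column for this thread

  // Construction-time packing: one bit per strided iteration.
  uint32_t predicates = 0;
  for (int s = 0; s < kStridedIterations; ++s) {
    uint32_t pred = (column + s * delta_strided < K) ? 1u : 0u;
    predicates |= (pred << s);
  }

  // Per-access test: a single AND decides whether the load is performed.
  for (int s = 0; s < kStridedIterations; ++s) {
    bool valid = (predicates & (1u << s)) != 0;
    std::printf("iteration %d: %s\n", s, valid ? "load" : "skip");
  }
  return 0;
}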
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Extracts the host-params objects into non-template code. +*/ + +#pragma once + +#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0 + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/threadblock/conv2d_params.h" +#include "cutlass/conv/conv3d_problem_size.h" + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Params structure used for all Conv3d analytic tile iterators +template< typename Layout_ = layout::TensorNDHWC > +struct Conv3dAnalyticParams { + + using Layout = Layout_; + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dAnalyticParams() { } + + CUTLASS_HOST_DEVICE + Conv3dAnalyticParams( + Conv3dProblemSize const &, // unused; placeholder to match other Params interfaces. 
+ Layout const &layout + ): layout(layout) { + + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized +template< typename Layout_ = layout::TensorNDHWC > +struct Conv3dFpropActivationIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized +template<> +struct Conv3dFpropActivationIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + Layout layout; + + int64_t inc_next[4]; // {next S, next R, next T, next C} + int filter_c_delta; // number of logical elements to add to filter_c_ + int ZPQ; // product of Z*P*Q + int PQ; // product of P*Q + + FastDivmod zpq_divmod; + FastDivmod pq_divmod; + FastDivmod q_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + PQ(problem_size.P * problem_size.Q), + ZPQ(problem_size.Z * problem_size.P * problem_size.Q), + zpq_divmod(ZPQ), + pq_divmod(PQ), + q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv3d_fprop", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 
-1 : 1); + + // next S + inc_next[0] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_w + ) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next T + inc_next[2] = conv_sign * ( + int64_t(layout.stride()[2]) * problem_size.dilation_d + - (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next C + inc_next[3] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + + +template< typename Layout_ = layout::TensorNDHWC > +struct Conv3dFpropFilterIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Conv3dFpropFilterIteratorOptimizedParams +{ + + using Layout = layout::TensorNDHWC; + + Layout layout; + int TRS; + int filter_c_delta; + + int64_t inc_next_k; // offset in units of bytes to next K position + int64_t inc_next_trs; // offset in units of bytes to next TRS position + int64_t inc_next_c; // offset in units of bytes to next C position + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv3dFpropFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dFpropFilterIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout) { + + TRACE_CONV_INITIALIZERS("conv3d_fprop", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + TRS = problem_size.T * problem_size.R * problem_size.S; + + inc_next_k = (int64_t(layout.stride()[3]) * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_trs = + ( int64_t(layout.stride()[0]) + - int64_t(layout.stride()[3]) * (threadmap_iterations.strided() - 1) * threadmap_delta.strided() + ) * element_size_bits / 8; + + inc_next_c = + ( + threadblock_shape.row() * problem_size.split_k_slices + - int64_t(TRS - 1) * layout.stride()[0] + - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3] + ) * element_size_bits / 8; + + filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters object for Conv3d DGRAD OutputGradient (dy) iterator +struct Conv3dDgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + Layout layout; + + int64_t inc_next[4]; // {next S, next R, next T, next K} + int filter_k_delta; // number of logical 
elements to add to filter_k_ + + FastDivmod dhw_divmod; + FastDivmod hw_divmod; + FastDivmod w_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + dhw_divmod(problem_size.D * problem_size.H * problem_size.W), + hw_divmod(problem_size.H * problem_size.W), + w_divmod(problem_size.W) { + + TRACE_CONV_INITIALIZERS("conv3d_dgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1); + + // next S + inc_next[0] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_w + ) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next T + inc_next[2] = conv_sign * ( + int64_t(layout.stride()[2]) * problem_size.dilation_d + - (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next K + inc_next[3] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters object for Conv2d DGRAD Filter (w) iterator +struct Conv3dDgradFilterIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + Layout layout; + int TRS; + int filter_k_delta; + + int64_t inc_next_strided; // offset in units of bytes to next K coordinate within tile + int64_t inc_next_trs; // offset in units of bytes to next TRS position + int64_t inc_next_k; // offset in units of bytes to next K position in subsequent tile + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv3dDgradFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dDgradFilterIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), TRS(problem_size.T * problem_size.R * problem_size.S) { + + TRACE_CONV_INITIALIZERS("conv3d_dgrad", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + inc_next_strided = (layout.stride()[3] * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_trs = + ( layout.stride()[0] 
+ - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3] + ) * element_size_bits / 8; + + inc_next_k = + ( + threadblock_shape.row() * problem_size.split_k_slices * layout.stride()[3] + - (problem_size.T * problem_size.R * problem_size.S - 1) * layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3] + ) * element_size_bits / 8; + + filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h index 0ad49abd31..396d856a13 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h index 35c4643052..2835480d80 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h index 74017c09f6..b8af8efa44 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
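Each of the optimized Params objects above folds its pointer bookkeeping into precomputed byte increments (the inc_next_* fields), so the iterator's advance() only ever adds a constant. The self-contained illustration below shows the idea on a simplified one-dimensional layout; the stride, element width, and TRS value are toy numbers and the formulas are deliberately reduced, not the CUTLASS expressions.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  int64_t stride_trs = 64;   // toy stride between filter positions, in elements
  int element_bits = 16;     // e.g. half-precision elements
  int TRS = 27;              // toy T * R * S

  // Precomputed once, analogous to inc_next_trs and the wrap-around increment above.
  int64_t inc_next_trs  = stride_trs * element_bits / 8;
  int64_t inc_next_wrap = -(int64_t)(TRS - 1) * stride_trs * element_bits / 8;

  // The inner loop only adds precomputed constants.
  int64_t byte_offset = 0;
  for (int trs = 0; trs < TRS; ++trs) {
    byte_offset += (trs + 1 == TRS) ? inc_next_wrap : inc_next_trs;
  }

  assert(byte_offset == 0);   // after a full TRS pass the pointer is back at the start
  std::printf("byte_offset after one TRS pass = %lld\n", (long long)byte_offset);
  return 0;
}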
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -151,6 +151,11 @@ class Conv3dWgradOutputGradientTileAccessIteratorAnalytic { } } + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h index 2cab09d1f3..d3b356e07d 100644 --- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h +++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -203,6 +203,11 @@ class Conv3dWgradOutputGradientTileAccessIteratorOptimized { set_iteration_index(0); } + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + /// Overrides the internal iteration index CUTLASS_HOST_DEVICE void set_iteration_index(Index index) { diff --git a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h index 1702847c10..aefdcd6db6 100644 --- a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h +++ b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h index 0d56ab6b3f..3d2062d536 100644 --- a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h +++ b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/coord.h b/include/cutlass/coord.h index 181e3116e8..7c7aaf3a0b 100644 --- a/include/cutlass/coord.h +++ b/include/cutlass/coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index bd69a707d3..b25806a33c 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index 622f037b40..5a70398026 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,16 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// +#define CUTLASS_UNUSED(expr) do { (void)(expr); } while (0) + +#if defined(_MSC_VER) + #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__) +#else + #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__) +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + namespace cutlass { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -43,6 +53,7 @@ namespace cutlass { #define CUTLASS_DEVICE __forceinline__ __device__ #else #define CUTLASS_HOST_DEVICE inline +#define CUTLASS_DEVICE inline #endif /// Status code returned by CUTLASS operations diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h index f5166ab16a..733e7b271b 100644 --- a/include/cutlass/device_kernel.h +++ b/include/cutlass/device_kernel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index d352ea5a64..49a63335b6 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
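The two macros added to cutlass/cutlass.h above can be exercised as in the sketch below. The macro definitions are copied from the hunk for self-containment; the function using them is a made-up example, not part of the patch.

#include <cassert>

#define CUTLASS_UNUSED(expr) do { (void)(expr); } while (0)

#if defined(_MSC_VER)
  #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __FUNCSIG__)
#else
  #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__)
#endif

// Hypothetical stub for a code path that is declared but not yet implemented.
void unimplemented_path(int argument) {
  CUTLASS_UNUSED(argument);      // marks the parameter as intentionally unused
  CUTLASS_NOT_IMPLEMENTED();     // fails an assert() in debug builds
}

int main() {
  // unimplemented_path(0);      // would trip the assertion in a debug build
  return 0;
}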
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,16 +45,33 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct Identity { + CUTLASS_HOST_DEVICE + T operator()(T value) const { + return value; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// ReLu operator - propagates NaNs template struct ReLu { CUTLASS_HOST_DEVICE - T operator()(T const & threshold, T const &value) const { + T operator()(T const & threshold, T value) const { if (value < threshold) { value = threshold; } return value; } + CUTLASS_HOST_DEVICE + T operator()(T value) const { + if (value < T()) { + value = T(); + } + return value; + } }; template @@ -107,6 +124,15 @@ struct Sigmoid > { } }; +// +// GELU function definitions implemented as described by +// Hendrycks, D., and Gimpel, K. in +// "Gaussian Error Linear Units (GELUs)." (2020) +// https://arxiv.org/pdf/1606.08415.pdf +// +// Floating-point constants are Taylor coefficients described in the paper. +// + // GELU operator template struct GELU { @@ -134,7 +160,7 @@ struct GELU > { GELU gelu_op; CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < int(rhs.size()); ++i) { + for (int i = 0; i < N; ++i) { y[i] = gelu_op(rhs[i]); } @@ -142,6 +168,72 @@ struct GELU > { } }; +// GELU operator implemented using the Taylor series approximation +template +struct GELU_taylor { + CUTLASS_HOST_DEVICE + T operator()(T const &z) const { + + T k0 = T(0.7978845608028654); + T k1 = T(0.044715); + + return T(cutlass::constants::half() * z * + (cutlass::constants::one() + fast_tanh(k0 * z * (cutlass::constants::one() + k1 * z * z)))); + } +}; + +template +struct GELU_taylor > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &rhs) const { + Array y; + GELU_taylor gelu_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + y[i] = gelu_op(rhs[i]); + } + + return y; + } +}; + +/// Computes backwards pass for GELU operator assuming d_t is the layer gradient and +/// z is computed from the forward pass. +template +struct dGELU { + CUTLASS_HOST_DEVICE + T operator()(T const &d_t, T const &z) const { + + T k0 = T(0.7978845608028654); + T k1 = T(0.044715); + T k2 = T(0.1070322243); + + T tanh_out = fast_tanh(k0 * z * (1 + k1 * z * z)); + + T ff = constants::half() * z * ((1 - tanh_out * tanh_out) * (k0 + k2 * z * z)) + + constants::half() * (1 + tanh_out); + + return ff * d_t; + } +}; + +template +struct dGELU > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &d_t, Array const &z) const { + Array y; + dGELU gelu_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + y[i] = gelu_op(d_t[i], z[i]); + } + + return y; + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace thread diff --git a/include/cutlass/epilogue/thread/conversion_op.h b/include/cutlass/epilogue/thread/conversion_op.h index ad17d41490..7cdf6cb0d0 100644 --- a/include/cutlass/epilogue/thread/conversion_op.h +++ b/include/cutlass/epilogue/thread/conversion_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
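The GELU_taylor functor above uses the tanh approximation from Hendrycks and Gimpel, with k0 = sqrt(2/pi) and k1 = 0.044715. The host-side scalar reference below mirrors that formula, substituting std::tanh for cutlass::fast_tanh; the sample inputs are arbitrary.

#include <cmath>
#include <cstdio>

// GELU(z) ~= 0.5 * z * (1 + tanh( sqrt(2/pi) * z * (1 + 0.044715 * z^2) ))
float gelu_taylor_ref(float z) {
  float const k0 = 0.7978845608028654f;   // sqrt(2 / pi)
  float const k1 = 0.044715f;
  return 0.5f * z * (1.0f + std::tanh(k0 * z * (1.0f + k1 * z * z)));
}

int main() {
  float samples[] = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
  for (float z : samples) {
    std::printf("z = %+5.2f  gelu(z) = %+.6f\n", z, gelu_taylor_ref(z));
  }
  return 0;
}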
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/linear_combination.h b/include/cutlass/epilogue/thread/linear_combination.h index 4fff764fe5..fa2f72ac14 100644 --- a/include/cutlass/epilogue/thread/linear_combination.h +++ b/include/cutlass/epilogue/thread/linear_combination.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "cutlass/array.h" #include "cutlass/functional.h" #include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/scale_type.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -51,6 +52,7 @@ template < int Count, ///< Number of elements computed per operation typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > class LinearCombination { @@ -140,6 +142,10 @@ class LinearCombination { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != ElementCompute(0); } @@ -208,3 +214,5 @@ class LinearCombination { } // namespace thread } // namespace epilogue } // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_bias_relu.h b/include/cutlass/epilogue/thread/linear_combination_bias_relu.h new file mode 100644 index 0000000000..8c898f9074 --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_bias_relu.h @@ -0,0 +1,265 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
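The new ScaleType template parameter lets the epilogue decide whether the source tensor must be loaded: NoBetaScaling always reads C, OnlyAlphaScaling never does, and the default falls back to the beta != 0 test. The standalone version below mirrors that decision; the enum and function are simplified stand-ins, not the cutlass::epilogue::thread types.

#include <cstdio>

enum class ScaleKind { Default, NoBetaScaling, OnlyAlphaScaling };

bool is_source_needed(ScaleKind scale, float beta) {
  if (scale == ScaleKind::NoBetaScaling)    return true;    // C is always read
  if (scale == ScaleKind::OnlyAlphaScaling) return false;   // C is never read
  return beta != 0.0f;                                      // Default: read C only when beta != 0
}

int main() {
  std::printf("Default, beta=0        -> %d\n", is_source_needed(ScaleKind::Default, 0.0f));
  std::printf("NoBetaScaling, beta=0  -> %d\n", is_source_needed(ScaleKind::NoBetaScaling, 0.0f));
  std::printf("OnlyAlphaScaling       -> %d\n", is_source_needed(ScaleKind::OnlyAlphaScaling, 2.0f));
  return 0;
}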
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Functor performing linear combination operations used by epilogues.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/array.h"
+#include "cutlass/functional.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/epilogue/thread/activation.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// This is a partial specialization for fused Bias and ReLU. It supports the option of packing
+/// ReLU conditionals in a bit vector that may be used by backwards passes as an optimization.
+///
+/// This class can only be used with cutlass::epilogue::threadblock::EpilogueWithBroadcast<>.
+///
+/// This base class is meant to define the concept required of the
+/// EpilogueWithBroadcast::OutputOp
+template <
+  typename ElementC_,
+  typename ElementAccumulator_,
+  typename ElementCompute_,
+  typename ElementZ_,
+  int ElementsPerAccess,
+  bool StoreT = true
+>
+class LinearCombinationBiasRelu {
+public:
+
+  using ElementOutput = ElementC_;
+  using ElementC = ElementC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementZ = ElementZ_;
+
+  using ElementT = uint1b_t;
+
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+
+  using ElementwiseOp = ReLu<ElementCompute>;
+  using BinaryOp = plus<ElementCompute>;
+
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentC = Array<ElementC, kElementsPerAccess>;
+  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
+  using FragmentT = Array<ElementT, kElementsPerAccess>;
+
+  /// If true, the 'Z' tensor is stored
+  static bool const kStoreZ = true;
+
+  /// If true, the 'T' tensor is stored
+  static bool const kStoreT = StoreT;
+
+  /// Host-constructable parameters structure
+  struct Params {
+
+    ElementCompute alpha;                  ///< scales accumulators
+    ElementCompute beta;                   ///< scales source tensor
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    ElementCompute threshold;              ///< ReLu threshold
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      alpha(ElementCompute(1)),
+      beta(ElementCompute()),
+      alpha_ptr(nullptr),
+      beta_ptr(nullptr),
+      threshold(ElementCompute()) { }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha,
+      ElementCompute beta,
+      ElementCompute threshold = ElementCompute()
+    ):
+      alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr), threshold(threshold) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute alpha
+    ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr), threshold(ElementCompute()) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr,
+      ElementCompute threshold = ElementCompute()
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold(threshold) {
+
+    }
+
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr
+    ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr), threshold(ElementCompute()) {
+
+    }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute alpha_;
+  ElementCompute beta_;
+  ElementCompute threshold_;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor from Params
+  CUTLASS_HOST_DEVICE
+  LinearCombinationBiasRelu(Params const &params) {
+
+    alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha);
+    beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta);
+    threshold_ = params.threshold;
+  }
+
+  /// Returns true if source is needed
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return beta_ != ElementCompute(0);
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {
+      beta_ = ElementCompute(1);
+    }
+  }
+
+  /// Applies the operation when is_source_needed() is true
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentC const &frag_C,
+    FragmentCompute const &V) const {
+
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+
+    bool conditions[kElementsPerAccess];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(alpha_ * tmp_Accum[i] + beta_ * tmp_C[i], V[i]);
+
+      bool condition = !(z < threshold_);
+      z = fmax(z, threshold_);
+
+      result_Z[i] = z;
+      conditions[i] = condition;
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    if (kStoreT) {
+      PackPredicates<kElementsPerAccess> pack_predicates;
+      frag_T = pack_predicates(conditions);
+    }
+  }
+
+  /// Applies the operation when is_source_needed() is false
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentCompute const &V) const {
+
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+
+    bool conditions[kElementsPerAccess];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(alpha_ * tmp_Accum[i], V[i]);
+
+      bool condition = !(z < threshold_);
+      z = fmax(z, threshold_);
+
+      result_Z[i] = z;
+      conditions[i] = condition;
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    if (kStoreT) {
+      PackPredicates<kElementsPerAccess> pack_predicates;
+      frag_T = pack_predicates(conditions);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h
index 62a6ea7872..b2231bf767 100644
--- a/include/cutlass/epilogue/thread/linear_combination_clamp.h
+++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h
@@ -1,5 +1,5 @@
/*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -53,6 +53,7 @@ template < int Count, ///< Number of elements computed per operation typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > class LinearCombinationClamp { @@ -97,6 +98,13 @@ class LinearCombinationClamp { } + CUTLASS_HOST_DEVICE + Params( + ElementCompute alpha + ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) { + + } + CUTLASS_HOST_DEVICE Params( ElementCompute const *alpha_ptr, @@ -104,6 +112,13 @@ class LinearCombinationClamp { ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute const *alpha_ptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) { + + } }; private: @@ -128,6 +143,10 @@ class LinearCombinationClamp { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != ElementCompute(0); } @@ -227,9 +246,10 @@ class LinearCombinationClamp { template < typename ElementOutput_, ///< Data type used to load and store tensors int Count, ///< Number of elements computed per operation + ScaleType::Kind Scale, ///< Control Alpha and Beta scaling FloatRoundStyle Round > -class LinearCombinationClamp { +class LinearCombinationClamp { public: using ElementOutput = ElementOutput_; @@ -283,6 +303,13 @@ class LinearCombinationClamp { } + CUTLASS_HOST_DEVICE + Params( + ElementCompute alpha + ): alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) { + + } + CUTLASS_HOST_DEVICE Params( ElementCompute const *alpha_ptr, @@ -290,6 +317,13 @@ class LinearCombinationClamp { ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute const *alpha_ptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) { + + } }; private: @@ -314,6 +348,10 @@ class LinearCombinationClamp { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != ElementCompute(0); } @@ -413,6 +451,8 @@ template < typename ElementOutput_, /// Number of elements computed per operation int Count, + ///< Control Alpha and Beta scaling + ScaleType::Kind Scale = ScaleType::Default, /// Rounding mode FloatRoundStyle Round = FloatRoundStyle::round_to_nearest> class FastLinearCombinationClamp { @@ -467,9 +507,17 @@ class FastLinearCombinationClamp { Params(ElementCompute alpha, ElementCompute beta) : alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) {} + CUTLASS_HOST_DEVICE + Params(ElementCompute alpha) + : alpha(alpha), beta(0), alpha_ptr(nullptr), beta_ptr(nullptr) {} + CUTLASS_HOST_DEVICE Params(ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr) : alpha(0), beta(0), 
alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) {} + + CUTLASS_HOST_DEVICE + Params(ElementCompute const *alpha_ptr) + : alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(nullptr) {} }; private: @@ -491,7 +539,13 @@ class FastLinearCombinationClamp { /// Returns true if source is needed CUTLASS_HOST_DEVICE - bool is_source_needed() const { return beta_ != ElementCompute(0); } + bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + + return beta_ != ElementCompute(0); + } /// Functionally required for serial reduction in the epilogue CUTLASS_HOST_DEVICE diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h index 30b6213478..c47e89f10f 100644 --- a/include/cutlass/epilogue/thread/linear_combination_gelu.h +++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h index 68f334bdb8..8ecaab65ff 100644 --- a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h +++ b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/linear_combination_relu.h b/include/cutlass/epilogue/thread/linear_combination_relu.h index 7a41404791..d545a78a0f 100644 --- a/include/cutlass/epilogue/thread/linear_combination_relu.h +++ b/include/cutlass/epilogue/thread/linear_combination_relu.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -35,6 +35,7 @@ #include "cutlass/functional.h" #include "cutlass/numeric_conversion.h" #include "cutlass/epilogue/thread/activation.h" +#include "cutlass/epilogue/thread/scale_type.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -53,6 +54,7 @@ template < int Count, ///< Number of elements computed per operation typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + ScaleType::Kind Scale = ScaleType::Default, ///< Control Alpha and Beta scaling FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > class LinearCombinationRelu { @@ -93,7 +95,7 @@ class LinearCombinationRelu { CUTLASS_HOST_DEVICE Params( ElementCompute alpha, - ElementCompute beta, + ElementCompute beta = ElementCompute(0), ElementCompute threshold = ElementCompute(0) ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { @@ -102,7 +104,7 @@ class LinearCombinationRelu { CUTLASS_HOST_DEVICE Params( ElementCompute const *alpha_ptr, - ElementCompute const *beta_ptr, + ElementCompute const *beta_ptr = nullptr, ElementCompute threshold = ElementCompute(0) ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { @@ -133,6 +135,10 @@ class LinearCombinationRelu { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != ElementCompute(0); } @@ -170,7 +176,11 @@ class LinearCombinationRelu { multiply_add mul_add_accumulator; ReLu relu; - intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform + if (Scale == ScaleType::NoBetaScaling) + intermediate = converted_source; + else + intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform + intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X // Compute threshold optionally @@ -224,9 +234,10 @@ class LinearCombinationRelu { template < typename ElementOutput_, ///< Data type used to load and store tensors int Count, ///< Number of elements computed per operation + ScaleType::Kind Scale, ///< Control Alpha and Beta scaling FloatRoundStyle Round > -class LinearCombinationRelu { +class LinearCombinationRelu { public: using ElementOutput = ElementOutput_; @@ -264,7 +275,7 @@ class LinearCombinationRelu { CUTLASS_HOST_DEVICE Params( ElementCompute alpha, - ElementCompute beta, + ElementCompute beta = ElementCompute(0), ElementCompute threshold = ElementCompute(0) ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { @@ -273,7 +284,7 @@ class LinearCombinationRelu { CUTLASS_HOST_DEVICE Params( ElementCompute const *alpha_ptr, - ElementCompute const *beta_ptr, + ElementCompute const *beta_ptr = nullptr, ElementCompute threshold = ElementCompute(0) ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { @@ -304,6 +315,10 @@ class LinearCombinationRelu { /// Returns true if source is needed CUTLASS_HOST_DEVICE bool is_source_needed() const { + if (Scale == ScaleType::NoBetaScaling) return true; + + if (Scale == ScaleType::OnlyAlphaScaling) return false; + return beta_ != 
ElementCompute(0);
   }
 
@@ -341,8 +356,12 @@ class LinearCombinationRelu {
     multiply_add<ComputeFragment> mul_add_accumulator;
     ReLu<ComputeFragment> relu;
 
-    intermediate = mul_add_source(beta_, converted_source);                             // X = beta * C + uniform
-    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
+    if (Scale == ScaleType::NoBetaScaling)
+      intermediate = converted_source;
+    else
+      intermediate = mul_add_source(beta_, converted_source);                           // X = beta * C + uniform
+
+    intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate);    // D = alpha * Accum + X
 
     // Compute threshold optionally
     intermediate = relu(threshold_, intermediate);
diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
index dbefd2258c..cea2d7a880 100644
--- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
+++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
diff --git a/include/cutlass/epilogue/thread/reduction_op.h b/include/cutlass/epilogue/thread/reduction_op.h
index 0331f0fad5..7078500fef 100644
--- a/include/cutlass/epilogue/thread/reduction_op.h
+++ b/include/cutlass/epilogue/thread/reduction_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modification, are permitted
  * provided that the following conditions are met:
diff --git a/include/cutlass/epilogue/thread/scale_type.h b/include/cutlass/epilogue/thread/scale_type.h
new file mode 100644
index 0000000000..200db83a12
--- /dev/null
+++ b/include/cutlass/epilogue/thread/scale_type.h
@@ -0,0 +1,54 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief Enum defines the behaviors of the epilogue.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Specifies the scaling mode applied by the epilogue's output operator
+struct ScaleType {
+  enum Kind {
+    Default,            // alpha x C + beta x D
+    NoBetaScaling,      // alpha x C + D
+    OnlyAlphaScaling    // alpha x C
+  };
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
index 5c12f21680..84db8e131e 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,6 +39,11 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/thread/linear_combination_gelu.h" +#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" +#include "cutlass/epilogue/thread/linear_combination_planar_complex.h" + #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" @@ -88,6 +93,7 @@ struct DefaultEpilogueComplexTensorOp { using OutputOp = OutputOp_; static int const kElementsPerAccess = ElementsPerAccess; using Operator = Operator_; + using ElementOutput = typename OutputOp::ElementOutput; using LayoutC = typename WarpMmaTensorOp::LayoutC; using ElementAccumulator = typename WarpMmaTensorOp::ElementC; @@ -173,6 +179,7 @@ struct DefaultEpilogueComplexTensorOp ; + + static int const kFragmentsPerIteration = 1; +}; + +/// Partial specialization for float <= float x 4 +template < + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp< + WarpShape, + InstructionShape, + float, + layout::RowMajor + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator< + ThreadMap, + float + >; + + static int const kFragmentsPerIteration = 2; }; /// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts. @@ -125,6 +156,8 @@ struct DefaultIteratorsTensorOp< 8, 8 >; + + static int const kFragmentsPerIteration = 2; }; /// Partial specialization for int8_t x 16 <= int32_t x 16 epilogues avoids shared memory bank conflicts. @@ -160,6 +193,8 @@ struct DefaultIteratorsTensorOp< 16, 8 >; + + static int const kFragmentsPerIteration = 1; }; /// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts. @@ -195,6 +230,8 @@ struct DefaultIteratorsTensorOp< 8, 8 >; + + static int const kFragmentsPerIteration = 1; }; /// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts. @@ -230,6 +267,8 @@ struct DefaultIteratorsTensorOp< 8, 8 >; + + static int const kFragmentsPerIteration = 1; }; } // namespace detail @@ -251,6 +290,7 @@ struct DefaultEpilogueTensorOp { static int const kPartitionsK = PartitionsK; using OutputOp = OutputOp_; static int const kElementsPerAccess = ElementsPerAccess; + using ElementOutput = typename OutputOp::ElementOutput; using LayoutC = typename WarpMmaTensorOp::LayoutC; using ElementAccumulator = typename WarpMmaTensorOp::ElementC; @@ -303,6 +343,8 @@ struct DefaultEpilogueTensorOp { /// Hard-coded padding elements added using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits::value * 4>; + static int const kFragmentsPerIteration = (kPartitionsK == 1 ? DefaultIterators::kFragmentsPerIteration : 1); + // // Define the epilogue // @@ -315,7 +357,8 @@ struct DefaultEpilogueTensorOp { WarpTileIterator, SharedLoadIterator, OutputOp, - Padding + Padding, + kFragmentsPerIteration >; }; @@ -325,7 +368,7 @@ struct DefaultEpilogueTensorOp { /// intereleaved output layout. For this case, shared memory is not needed. 
template + bool isSplitK = false> struct DefaultInterleavedEpilogueTensorOp { using Shape = Shape_; using WarpMmaTensorOp = WarpMmaTensorOp_; @@ -362,7 +405,7 @@ struct DefaultInterleavedEpilogueTensorOp { // using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue< Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator, - AccumulatorFragmentIterator, OutputOp, InterleavedK, IsBetaZero>; + AccumulatorFragmentIterator, OutputOp, InterleavedK>; }; //////////////////////////////////////////////////////////////////////////////// @@ -371,7 +414,7 @@ struct DefaultInterleavedEpilogueTensorOp { /// intereleaved output layout. For this case, shared memory is not needed. template + bool isSplitK = false> struct DefaultInterleavedConvEpilogue { using Shape = Shape_; using WarpMmaTensorOp = WarpMmaTensorOp_; @@ -408,7 +451,7 @@ struct DefaultInterleavedConvEpilogue { // using Epilogue = cutlass::epilogue::threadblock::InterleavedEpilogue< Shape, WarpMmaTensorOp, kPartitionsK, OutputTileIterator, - AccumulatorFragmentIterator, OutputOp, InterleavedK, IsBetaZero>; + AccumulatorFragmentIterator, OutputOp, InterleavedK>; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h index 7fec5110f4..4dbd339fd9 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -40,6 +40,11 @@ #include "cutlass/epilogue/thread/linear_combination.h" #include "cutlass/epilogue/thread/linear_combination_clamp.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/thread/linear_combination_gelu.h" +#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" +#include "cutlass/epilogue/thread/linear_combination_planar_complex.h" + #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h index 58425c286c..353b0f5478 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -40,6 +40,11 @@ #include "cutlass/epilogue/thread/linear_combination.h" #include "cutlass/epilogue/thread/linear_combination_clamp.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/epilogue/thread/linear_combination_gelu.h" +#include "cutlass/epilogue/thread/linear_combination_sigmoid.h" +#include "cutlass/epilogue/thread/linear_combination_planar_complex.h" + #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h index 69298d515a..0f33ad9a41 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h index 752b1ee9b4..901b16845f 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h index 9776ba0682..f9f77c2223 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h index cd828c697e..ccde4a526c 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h index f14be1ff8e..8f9dd454be 100644 --- a/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue.h b/include/cutlass/epilogue/threadblock/epilogue.h index a27541b47a..9afd3d5f7f 100644 --- a/include/cutlass/epilogue/threadblock/epilogue.h +++ b/include/cutlass/epilogue/threadblock/epilogue.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -63,7 +63,7 @@ namespace threadblock { //////////////////////////////////////////////////////////////////////////////// -/// Epilogue operator without splitk +/// Epilogue operator template < typename Shape_, ///< Shape of threadblock tile (concept: GemmShape) typename WarpMmaOperator_, ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp) @@ -73,7 +73,8 @@ template < typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading from SMEM typename OutputOp_, ///< Output operator - typename Padding_ ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape) + typename Padding_, ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape) + int FragmentsPerPartition = 1 ///< Used to coarsten the epilogue granularity > class Epilogue : public EpilogueBase< @@ -82,7 +83,8 @@ class Epilogue : PartitionsK, AccumulatorFragmentIterator_, WarpTileIterator_, - Padding_> { + Padding_, + FragmentsPerPartition> { public: @@ -92,7 +94,8 @@ class Epilogue : PartitionsK, AccumulatorFragmentIterator_, WarpTileIterator_, - Padding_>; + Padding_, + FragmentsPerPartition>; using Shape = Shape_; using WarpMmaOperator = WarpMmaOperator_; @@ -113,7 +116,6 @@ class Epilogue : /// Accumulator element using ElementAccumulator = typename WarpTileIterator::Element; - /// Output element using ElementOutput = typename OutputTileIterator::Element; @@ -139,6 +141,9 @@ class Epilogue : /// Number of warps using WarpCount = typename Base::WarpCount; + int const kSmemTiles = Base::kFragmentsPerIteration > 1 ? 
Base::kFragmentsPerIteration : kPartitionsK; + int const kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles; + public: @@ -166,7 +171,10 @@ class Epilogue : int lane_idx ///< Id of thread within warp ): Base(shared_storage, thread_idx, warp_idx, lane_idx), - shared_load_iterator_(shared_storage.reference(), thread_idx) { } + shared_load_iterator_(shared_storage.reference(), thread_idx) + { + + } /// Streams the result to global memory CUTLASS_DEVICE @@ -177,7 +185,7 @@ class Epilogue : OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) if (!output_op.is_source_needed()) { - compute_source_not_needed_(output_op, destination_iterator, accumulators); + compute_source_not_needed_(output_op, destination_iterator, accumulators); } else { compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator); @@ -185,6 +193,8 @@ class Epilogue : } private: + + static_assert(kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, "One of these must be exactly 1."); /// Streams the result to global memory CUTLASS_DEVICE @@ -205,7 +215,7 @@ class Epilogue : // CUTLASS_PRAGMA_UNROLL - for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + for (int iter = 0; iter < OutputTileIterator::kIterations; iter += Base::kFragmentsPerIteration) { // // Convert and store fragment @@ -213,12 +223,24 @@ class Epilogue : __syncthreads(); - typename AccumulatorFragmentIterator::Fragment accum_fragment; - accum_fragment_iterator.load(accum_fragment); - ++accum_fragment_iterator; + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { + typename AccumulatorFragmentIterator::Fragment accum_fragment; - this->warp_tile_iterator_.store(accum_fragment); + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + this->warp_tile_iterator_.store(accum_fragment); + + if (p < Base::kFragmentsPerIteration - 1) { + this->warp_tile_iterator_.add_pointer_offset(kSmemPointerOffset); + } + } + + if (Base::kFragmentsPerIteration > 1) { + this->warp_tile_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration)); + } __syncthreads(); @@ -226,45 +248,53 @@ class Epilogue : // Load fragments from shared memory // - typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK]; + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { - shared_load_iterator_.load(aligned_accum_fragment[0]); - // If the number of k-slices is > 1 - perform a reduction amongst the k-slices - if (kPartitionsK > 1) - { - plus add_fragments; - const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK; + typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK]; - CUTLASS_PRAGMA_UNROLL - for ( int i = 1; i < kPartitionsK; ++i) { - shared_load_iterator_.add_tile_offset({tile_row_offset , 0}); - shared_load_iterator_.load(aligned_accum_fragment[i]); - aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); + shared_load_iterator_.load(aligned_accum_fragment[0]); + + if (p < Base::kFragmentsPerIteration - 1) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); } + else if (kPartitionsK > 1) { - shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0}); - } + plus add_fragments; - // - // Compute the output result - // - - typename OutputTileIterator::Fragment output_fragment; + CUTLASS_PRAGMA_UNROLL + for ( int i = 1; i 
< kPartitionsK; ++i) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); + } - apply_output_operator_source_not_needed_(output_fragment, output_op, aligned_accum_fragment[0]); + shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset); + } + // + // Compute the output result + // - // - // Store the final result - // + typename OutputTileIterator::Fragment output_fragment; - destination_iterator.store(output_fragment); - ++destination_iterator; - + apply_output_operator_source_not_needed_(output_fragment, output_op, aligned_accum_fragment[0]); + + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + } + + if (Base::kFragmentsPerIteration > 1) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset * (1 - Base::kFragmentsPerIteration)); + } } } - /// Streams the result to global memory CUTLASS_DEVICE @@ -323,19 +353,18 @@ class Epilogue : shared_load_iterator_.load(aligned_accum_fragment[0]); // If the number of k-slices is > 1 - perform a reduction amongst the k-slices - if (kPartitionsK > 1) - { + if (kPartitionsK > 1) { + plus add_fragments; - const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK; CUTLASS_PRAGMA_UNROLL for ( int i = 1; i < kPartitionsK; ++i) { - shared_load_iterator_.add_tile_offset({tile_row_offset , 0}); + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); shared_load_iterator_.load(aligned_accum_fragment[i]); aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); } - shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0}); + shared_load_iterator_.add_pointer_offset((1 - kPartitionsK) * kSmemPointerOffset); } // diff --git a/include/cutlass/epilogue/threadblock/epilogue_base.h b/include/cutlass/epilogue/threadblock/epilogue_base.h index a9b5a41404..76692d43cd 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_base.h +++ b/include/cutlass/epilogue/threadblock/epilogue_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -66,7 +66,8 @@ template < int PartitionsK, ///< Number of partitions of the K dimension typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting accumulators typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM - typename Padding_ ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape) + typename Padding_, ///< Padding added to SMEM allocation to avoid bank conflicts (concept: MatrixShape) + int FragmentsPerIteration = 1 > class EpilogueBase { public: @@ -94,6 +95,9 @@ class EpilogueBase { kPartitionsK >; + /// Use this to control the granularity of one epilogue 'iteration' + static int const kFragmentsPerIteration = FragmentsPerIteration; + public: /// Shared storage allocation needed by the epilogue @@ -120,7 +124,7 @@ class EpilogueBase { /// Shape of the shared memory allocation for the epilogue using StorageShape = MatrixShape< - Shape::kRow + Padding::kRow, + (Shape::kRow + Padding::kRow) * kFragmentsPerIteration, Shape::kColumn + Padding::kColumn >; diff --git a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h index 6cb9963615..eae1ad4ff5 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h +++ b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue_workspace.h b/include/cutlass/epilogue/threadblock/epilogue_workspace.h index 36d196a37f..2341051c87 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_workspace.h +++ b/include/cutlass/epilogue/threadblock/epilogue_workspace.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h index b616545b9f..7bf7b4de8e 100644 --- a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h +++ b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,9 +73,7 @@ template < /// Output operator typename OutputOp_, /// Number of interleaved k - int InterleavedK, - /// Whether Beta is zero - bool IsBetaZero = false> + int InterleavedK> class InterleavedEpilogue { public: using Shape = Shape_; @@ -149,21 +147,75 @@ class InterleavedEpilogue { OutputTileIterator destination_iterator, ///< Tile iterator for destination AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + if (!output_op.is_source_needed()) { + compute_source_not_needed_(output_op, destination_iterator, accumulators); + } + else { + compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator); + } + } + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_not_needed_( + OutputOp const &output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const &accumulators ///< Complete warp-level accumulator tile + ) { // - // Predicated tile iterators constructed from members + // Iterator over warp-level accumulator fragment // - if (IsBetaZero && output_op.is_source_needed()) - assert(0); + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); - typename OutputTileIterator::Fragment source_fragment; + // + // Iterate over accumulator tile + // + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + + // + // Convert fragment + // + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + apply_output_operator_source_not_needed_(output_op, output_fragment, accum_fragment); + + // + // Store the final result + // - if (!IsBetaZero) { - if (!output_op.is_source_needed()) { - source_iterator.clear_mask(); - } + destination_iterator.set_iteration_index(iter); + destination_iterator.store(output_fragment); + ++destination_iterator; } + } + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_needed_( + OutputOp const &output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile + OutputTileIterator source_iterator ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + ) { + + // + // Predicated tile iterators constructed from members + // + + typename OutputTileIterator::Fragment source_fragment; source_fragment.clear(); @@ -183,11 +235,9 @@ class InterleavedEpilogue { // Load the source // - if (!IsBetaZero) { - source_iterator.set_iteration_index(iter); - source_iterator.load(source_fragment); - ++source_iterator; - } + source_iterator.set_iteration_index(iter); + source_iterator.load(source_fragment); + ++source_iterator; // // Convert fragment @@ -243,6 +293,30 @@ class InterleavedEpilogue { output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]); } } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_source_not_needed_( + OutputOp const &output_op, ///< Output operator + 
typename OutputTileIterator::Fragment &output_fragment, + typename AccumulatorFragmentIterator::Fragment const + &aligned_accum_fragment) { + OutputAccessType *output_frag_ptr = + reinterpret_cast(&output_fragment); + + AccumulatorAccessType const *compute_frag_ptr = + reinterpret_cast( + &aligned_accum_fragment); + + int const kOutputOpIterations = OutputTileIterator::Fragment::kElements / + OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + // Call the output operator + output_frag_ptr[i] = output_op(compute_frag_ptr[i]); + } + } }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h index cfe13cc167..377f33bd95 100644 --- a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h +++ b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -371,6 +371,11 @@ struct OutputTileOptimalThreadMap { using Shape = Shape_; + using TileShape = MatrixShape< + Shape::kTile * Shape::kCluster * Shape::kGroup * Shape::kRow, + Shape::kColumn + >; + using Iterations = OutputTileShape< Detail::RowArrangement::kIterationsColumn, Detail::RowArrangement::kIterationsRow, diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h index 1be50cbd90..a4a5d15a12 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -176,7 +176,7 @@ class PredicatedTileIterator { /// Internal state counter int state_[3]; - + private: // @@ -214,6 +214,11 @@ class PredicatedTileIterator { + ThreadMap::Delta::kColumn * c) < extent.column()); } + // Null pointer performs no accesses + if (!pointer) { + mask_.clear(); + } + // Initialize pointer byte_pointer_ = reinterpret_cast(pointer) + LongIndex(thread_offset.row()) * LongIndex(params_.stride) + @@ -288,11 +293,12 @@ class PredicatedTileIterator { } } + /// Loads a fragment from memory CUTLASS_DEVICE void load(Fragment &frag) { - load_with_byte_offset(frag, 0); + load_with_byte_offset(frag, 0); } /// Stores a fragment to memory @@ -326,11 +332,10 @@ class PredicatedTileIterator { bool guard = row_guard && mask_.predicates[column]; - if (guard) { - - memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess] = - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]; - } + cutlass::arch::global_store( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); } if (row + 1 < ThreadMap::Iterations::kRow) { @@ -349,11 +354,12 @@ class PredicatedTileIterator { } } + /// Stores a fragment to memory CUTLASS_DEVICE void store(Fragment const &frag) { - store_with_byte_offset(frag, 0); + store_with_byte_offset(frag, 0); } /// Advances to the next position to load or store @@ -404,7 +410,7 @@ class PredicatedTileIterator { ///< Sets the mask CUTLASS_DEVICE void get_mask(Mask &mask) { - return mask_; + mask = mask_; } ///< Sets the mask @@ -644,9 +650,8 @@ class InterleavedPredicatedTileIterator { bool guard = col_guard && mask_.predicates[iteration_contiguous_]; - if (guard) { - *memory_pointer = *frag_ptr; - } + cutlass::arch::global_store( + *frag_ptr, (void *)memory_pointer, guard); } /// Overrides the internal iteration index @@ -689,7 +694,7 @@ class InterleavedPredicatedTileIterator { ///< Sets the mask CUTLASS_DEVICE void get_mask(Mask &mask) { - return mask_; + mask = mask_; } ///< Sets the mask @@ -949,9 +954,8 @@ class InterleavedConvPredicatedTileIterator { AccessType const *frag_ptr = reinterpret_cast(&frag); AccessType *memory_pointer = reinterpret_cast(byte_pointer); - if (guard) { - *memory_pointer = *frag_ptr; - } + cutlass::arch::global_store( + *frag_ptr, (void *)memory_pointer, guard); } /// Overrides the internal iteration index @@ -993,7 +997,7 @@ class InterleavedConvPredicatedTileIterator { ///< Sets the mask CUTLASS_DEVICE void get_mask(Mask &mask) { - return mask_; + mask = mask_; } ///< Sets the mask diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h index a08e1e0616..d73ce1bdfa 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator.h b/include/cutlass/epilogue/threadblock/shared_load_iterator.h index 0aa3dbb19d..b5fefa26db 100644 --- a/include/cutlass/epilogue/threadblock/shared_load_iterator.h +++ b/include/cutlass/epilogue/threadblock/shared_load_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -61,7 +61,7 @@ template < class SharedLoadIterator { public: using ThreadMap = ThreadMap_; - using Shape = typename ThreadMap::Shape; + using Shape = typename ThreadMap::TileShape; using Element = Element_; @@ -151,7 +151,9 @@ class SharedLoadIterator { CUTLASS_DEVICE void add_tile_offset(TensorCoord const &offset) { - add_pointer_offset(offset.row() * stride_ / (sizeof_bits::value / 8) + offset.column() * Shape::kColumn); + byte_pointer_ += + offset.row() * Shape::kRow * stride_ + + offset.column() * Shape::kColumn * sizeof_bits::value / 8; } /// Loads a fragment from memory diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h index d37b07d562..5b31e33727 100644 --- a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h +++ b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -171,7 +171,7 @@ class SharedLoadIteratorMixed { void add_pointer_offset(LongIndex pointer_offset) { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kLoadsPerAccess; ++i) { - pointers_ += pointer_offset / LoadType::kElements; + pointers_[i] += pointer_offset / LoadType::kElements; } } @@ -179,7 +179,9 @@ class SharedLoadIteratorMixed { void add_tile_offset(TensorCoord const &offset) { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kLoadsPerAccess; ++i) { - pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + pointers_[i] += + offset.row() * Shape::kRow * stride_ + + offset.column() * Shape::kColumn / LoadType::kElements; } } @@ -236,7 +238,7 @@ class SharedLoadIteratorMixed { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Partial specialization for int32_t x 16 => int8_t x 16 +/// Partial specialization for int32_t x 8 => int8_t x 8 template < typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap) > @@ -339,7 +341,9 @@ class SharedLoadIteratorMixed { void add_tile_offset(TensorCoord const &offset) { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kLoadsPerAccess; ++i) { - pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + pointers_[i] += + offset.row() * Shape::kRow * stride_ + + offset.column() * Shape::kColumn / LoadType::kElements; } } @@ -497,7 +501,9 @@ class SharedLoadIteratorMixed { void add_tile_offset(TensorCoord const &offset) { CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kLoadsPerAccess; ++i) { - pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + pointers_[i] += + offset.row() * Shape::kRow * stride_ + + offset.column() * Shape::kColumn / LoadType::kElements; } } diff --git a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h index 1bab9104c7..633d92193c 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h index 4c95649244..6117e167de 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_simt.h b/include/cutlass/epilogue/warp/fragment_iterator_simt.h index 6d75e5697b..b2ed96cf44 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_simt.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h index f620e4bddf..b028dedfde 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h index 1abbbdc03c..c826b2be55 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h index b2a0612ac5..fbceee5dba 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/simt_policy.h b/include/cutlass/epilogue/warp/simt_policy.h index 3e096978da..058a6c4413 100644 --- a/include/cutlass/epilogue/warp/simt_policy.h +++ b/include/cutlass/epilogue/warp/simt_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tensor_op_policy.h b/include/cutlass/epilogue/warp/tensor_op_policy.h index fd085c47b6..93eeda3e56 100644 --- a/include/cutlass/epilogue/warp/tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_simt.h b/include/cutlass/epilogue/warp/tile_iterator_simt.h index a9d03db1c3..552f15b3f2 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_simt.h +++ b/include/cutlass/epilogue/warp/tile_iterator_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h index 33cee0d375..7c22af81f0 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h index 82a93e2d00..cec0b8f27e 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h +++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h index 1754f58016..75c064e285 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h index c8eab0ceb1..1ea6dd4f43 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h index b0ecc5eb6f..e8e14f3e4d 100644 --- a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h index 7b938d3712..b1bc6cf3b1 100644 --- a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h index 4d9503e5f2..c54bdac52f 100644 --- a/include/cutlass/fast_math.h +++ b/include/cutlass/fast_math.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -30,9 +30,12 @@ #else #include #include +#include #endif #include "cutlass/cutlass.h" +#include "cutlass/uint128.h" +#include "cutlass/coord.h" /** * \file @@ -151,6 +154,7 @@ constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; } +///////////////////////////////////////////////////////////////////////////////////////////////// /** * log2 computation, what's the @@ -221,6 +225,8 @@ void fast_divmod(int& quo, int64_t& rem, int64_t src, int div, unsigned int mul, rem = src - (quo * div); } +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Object to encapsulate the fast division+modulus operation. 
/// /// This object precomputes two values used to accelerate the computation and is best used @@ -272,9 +278,159 @@ struct FastDivmod { } }; -/****************************************************************************** - * Min/Max - ******************************************************************************/ +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Object to encapsulate the fast division+modulus operation for 64b integer division. +/// +/// This object precomputes two values used to accelerate the computation and is best used +/// when the divisor is a grid-invariant. In this case, it may be computed in host code and +/// marshalled along other kernel arguments using the 'Params' pattern. +/// +/// Example: +/// +/// +/// uint64_t quotient, remainder, dividend, divisor; +/// +/// FastDivmodU64 divmod(divisor); +/// +/// divmod(quotient, remainder, dividend); +/// +/// // quotient = (dividend / divisor) +/// // remainder = (dividend % divisor) +/// +struct FastDivmodU64 { + + uint64_t divisor; + uint64_t multiplier; + unsigned int shift_right; + unsigned int round_up; + + // + // Static methods + // + + /// Computes b, where 2^b is the greatest power of two that is less than or equal to x + CUTLASS_HOST_DEVICE + static uint32_t integer_log2(uint64_t x) { + uint32_t n = 0; + while (x >>= 1) { + ++n; + } + return n; + } + + /// Default ctor + CUTLASS_HOST_DEVICE + FastDivmodU64(): divisor(0), multiplier(0), shift_right(0), round_up(0) { } + + /// Construct the FastDivmod object, in host code ideally. + /// + /// This precomputes some values based on the divisor and is computationally expensive. + CUTLASS_HOST_DEVICE + FastDivmodU64(uint64_t divisor_): divisor(divisor_), multiplier(1), shift_right(0), round_up(0) { + + if (divisor) { + shift_right = integer_log2(divisor); + + if ((divisor & (divisor - 1)) == 0) { + multiplier = 0; + } + else { + uint64_t power_of_two = (uint64_t(1) << shift_right); + uint64_t multiplier_lo = uint128_t(0, power_of_two) / divisor; + multiplier = uint128_t(power_of_two, power_of_two) / divisor; + round_up = (multiplier_lo == multiplier ? 1 : 0); + } + } + } + + /// Returns the quotient of floor(dividend / divisor) + CUTLASS_HOST_DEVICE + uint64_t divide(uint64_t dividend) const { + uint64_t quotient = 0; + + #ifdef __CUDA_ARCH__ + uint64_t x = dividend; + if (multiplier) { + x = __umul64hi(dividend + round_up, multiplier); + } + quotient = (x >> shift_right); + #else + // TODO - use proper 'fast' division here also. No reason why x86-code shouldn't be optimized. + quotient = dividend / divisor; + #endif + + return quotient; + } + + /// Computes the remainder given a computed quotient and dividend + CUTLASS_HOST_DEVICE + uint64_t modulus(uint64_t quotient, uint64_t dividend) const { + return uint32_t(dividend - quotient * divisor); + } + + /// Returns the quotient of floor(dividend / divisor) and computes the remainder + CUTLASS_HOST_DEVICE + uint64_t divmod(uint64_t &remainder, uint64_t dividend) const { + uint64_t quotient = divide(dividend); + remainder = modulus(quotient, dividend); + return quotient; + } + + /// Computes integer division and modulus using precomputed values. This is computationally + /// inexpensive. 
+  CUTLASS_HOST_DEVICE
+  void operator()(uint64_t &quotient, uint64_t &remainder, uint64_t dividend) const {
+    quotient = divmod(remainder, dividend);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Computes the coordinate decomposition from a linear index.
+///
+/// This decomposition is accelerated by the FastDivmodU64 object. It is assumed that
+/// a coordinate of indices can be decomposed by div/mod operations.
+/// Note, it is assumed that element divmod[0] divides by extent[1].
+///
+/// For example, assume 4-D coordinate (n, p, q, c) is mapped to a linear index `npqc`. This
+/// can be decomposed via three divide and modulus operations:
+///
+///   c = npqc % C;         |  divmod[2] = FastDivmodU64(C)
+/// npq = npqc / C;         |   coord[3] = c
+///
+///   q =  npq % Q;         |  divmod[1] = FastDivmodU64(Q)
+///  np =  npq / Q;         |   coord[2] = q
+///
+///   p =   np % P;         |  divmod[0] = FastDivmodU64(P)
+///   n =   np / P;         |   coord[1] = p
+///
+///                         |   coord[0] = n
+///
+template <int Rank>
+CUTLASS_HOST_DEVICE Coord<Rank> CoordinateDecomposition(
+  uint64_t linear_idx,                    ///< Linear index to decompose
+  FastDivmodU64 const *divmod) {          ///< Pointer to array of Rank-1 FastDivmodU64 objects
+
+  static_assert(Rank > 0, "CoordinateDecomposition requires Rank=1 or greater.");
+
+  Coord<Rank> coord;
+
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = Rank; i > 1; --i) {
+    uint64_t remainder;
+    linear_idx = divmod[i - 2].divmod(remainder, linear_idx);
+    coord[i - 1] = int(remainder);
+  }
+
+  coord[0] = int(linear_idx);
+
+  return coord;
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Min/Max
+/////////////////////////////////////////////////////////////////////////////////////////////////
 
 template
 struct Min {
@@ -296,6 +452,30 @@ constexpr int const_max(int a, int b) {
   return (b > a ? b : a);
 }
 
+template <typename T>
+CUTLASS_HOST_DEVICE
+T fast_min(T a, T b) {
+  return (b < a ? b : a);
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+float fast_min(float a, float b) {
+  return fminf(a, b);
+}
+
+template <typename T>
+CUTLASS_HOST_DEVICE
+T fast_max(T a, T b) {
+  return (a < b ? b : a);
+}
+
+template <>
+CUTLASS_HOST_DEVICE
+float fast_max(float a, float b) {
+  return fmaxf(a, b);
+}
+
 CUTLASS_HOST_DEVICE
 float fast_cos(float theta) {
 #if defined(__CUDA_ARCH__)
@@ -404,6 +584,24 @@ double fast_log(double x) {
 #endif
 }
 
+CUTLASS_HOST_DEVICE
+float fast_tanh(float x) {
+  #if defined(__CUDA_ARCH__)
+  return ::tanhf(x);
+  #else
+  return std::tanh(x);
+  #endif
+}
+
+CUTLASS_HOST_DEVICE
+double fast_tanh(double x) {
+  #if defined(__CUDA_ARCH__)
+  return ::tanh(x);
+  #else
+  return std::tanh(x);
+  #endif
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 } // namespace cutlass
diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index d20c45df2e..52d4ca59e6 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
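
The `FastDivmodU64` and `CoordinateDecomposition` helpers added to `fast_math.h` above can be exercised from host code as well as device code. A minimal host-side sketch, assuming the CUTLASS include paths are available and using illustrative tensor extents:

```
#include <cstdint>
#include <iostream>

#include "cutlass/fast_math.h"

int main() {
  // Extents of an (n, p, q, c) tensor; the values are illustrative.
  uint64_t const P = 56, Q = 56, C = 64;

  // Per the comment above, divmod[0] divides by P, divmod[1] by Q, divmod[2] by C.
  cutlass::FastDivmodU64 divmod[] = {
    cutlass::FastDivmodU64(P),
    cutlass::FastDivmodU64(Q),
    cutlass::FastDivmodU64(C)
  };

  uint64_t linear_idx = 123456;

  // Recovers (n, p, q, c) such that ((n * P + p) * Q + q) * C + c == linear_idx.
  cutlass::Coord<4> coord = cutlass::CoordinateDecomposition<4>(linear_idx, divmod);

  std::cout << coord[0] << ", " << coord[1] << ", "
            << coord[2] << ", " << coord[3] << std::endl;

  // FastDivmodU64 may also be used directly for a single division.
  uint64_t quotient, remainder;
  cutlass::FastDivmodU64 divmod_c(C);
  divmod_c(quotient, remainder, linear_idx);   // quotient = idx / C, remainder = idx % C

  return 0;
}
```
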
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -235,6 +235,156 @@ struct conjugate { ///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct logical_and { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return ((a && b) ? T(1) : T()); + } +}; + +template +struct logical_or { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return ((a || b) ? T(1) : T()); + } +}; + +template +struct logical_not { + CUTLASS_HOST_DEVICE + T operator()(T const &a) const { + return T(!(a)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct bit_and { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return a & b; + } +}; + +template +struct bit_or { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return a | b; + } +}; + +template +struct bit_not { + CUTLASS_HOST_DEVICE + T operator()(T const &a) const { + return ~a; + } +}; + +template +struct bit_xor { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b) const { + return a ^ b; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Partial specializations for Arrays +template +struct bit_and> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] & b_data[i]); + } + + return result; + } +}; + +// Partial specializations for Arrays +template +struct bit_or> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] | b_data[i]); + } + + return result; + } +}; + + +// Partial specializations for Arrays +template +struct bit_not> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (~a_data[i]); + } + + return result; + } +}; + +// Partial specializations for Arrays +template +struct bit_xor> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] ^ b_data[i]); + } + + return result; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + template struct conjugate> 
{ CUTLASS_HOST_DEVICE diff --git a/include/cutlass/gemm/device/default_gemm_configuration.h b/include/cutlass/gemm/device/default_gemm_configuration.h index c65b3f0062..ad38bf63e3 100644 --- a/include/cutlass/gemm/device/default_gemm_configuration.h +++ b/include/cutlass/gemm/device/default_gemm_configuration.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h index 70383e15ef..e1d0092cdb 100644 --- a/include/cutlass/gemm/device/gemm.h +++ b/include/cutlass/gemm/device/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,7 +133,9 @@ namespace device { /// Operator class tag typename OperatorClass, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) @@ -211,9 +213,7 @@ template < /// Operation performed by GEMM typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, - ElementAccumulator_>::Operator, - /// Whether Beta is zero or not - bool IsBetaZero = false> + ElementAccumulator_>::Operator> class Gemm { public: @@ -241,7 +241,6 @@ class Gemm { static int const kAlignmentB = AlignmentB; static int const kAlignmentC = EpilogueOutputOp::kCount; static bool const kSplitKSerial = SplitKSerial; - static bool const kIsBetaZero = IsBetaZero; static ComplexTransform const kTransformA = ComplexTransform::kNone; static ComplexTransform const kTransformB = ComplexTransform::kNone; @@ -265,8 +264,7 @@ class Gemm { ThreadblockSwizzle, kStages, kSplitKSerial, - Operator, - kIsBetaZero + Operator >::GemmKernel; /// Argument structure @@ -533,15 +531,13 @@ template < /// If true, kernel supports split-K as a serial reduction bool SplitKSerial, /// Operation performed by GEMM - typename Operator_, - /// Beta is zero or not - bool IsBetaZero> + typename Operator_> class Gemm { + Operator_> { public: using ElementA = ElementA_; @@ -569,7 +565,6 @@ class Gemm; using UnderlyingArguments = typename UnderlyingOperator::Arguments; diff --git a/include/cutlass/gemm/device/gemm_array.h b/include/cutlass/gemm/device/gemm_array.h index c44579e005..12bc300ff2 100644 --- a/include/cutlass/gemm/device/gemm_array.h +++ b/include/cutlass/gemm/device/gemm_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
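
The device-level `Gemm` changes in this chunk drop the trailing `IsBetaZero` template parameter, so instantiations simply omit it. A minimal SGEMM-style sketch relying on the documented defaults (SIMT operator class, `arch::Sm70` tuning tag):

```
#include "cutlass/gemm/device/gemm.h"

// Column-major single-precision GEMM using the default SIMT configuration.
// Note there is no longer a trailing IsBetaZero argument to spell out.
using Sgemm = cutlass::gemm::device::Gemm<
    float, cutlass::layout::ColumnMajor,   // A
    float, cutlass::layout::ColumnMajor,   // B
    float, cutlass::layout::ColumnMajor>;  // C and D
```
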
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,7 +133,9 @@ namespace device { /// Operator class tag typename OperatorClass, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) @@ -257,8 +259,7 @@ class GemmArray { ThreadblockSwizzle, kStages, false, - Operator, - false + Operator >::GemmKernel; using GemmKernel = kernel::GemmArray; diff --git a/include/cutlass/gemm/device/gemm_batched.h b/include/cutlass/gemm/device/gemm_batched.h index 052bd90093..8f09b4a77c 100644 --- a/include/cutlass/gemm/device/gemm_batched.h +++ b/include/cutlass/gemm/device/gemm_batched.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,7 +133,9 @@ namespace device { /// Operator class tag typename OperatorClass, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) @@ -257,8 +259,7 @@ class GemmBatched { ThreadblockSwizzle, kStages, false, - Operator, - false + Operator >::GemmKernel; using GemmKernel = kernel::GemmBatched; diff --git a/include/cutlass/gemm/device/gemm_complex.h b/include/cutlass/gemm/device/gemm_complex.h index 8ad1036bb1..70e0b46a38 100644 --- a/include/cutlass/gemm/device/gemm_complex.h +++ b/include/cutlass/gemm/device/gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -47,37 +47,40 @@ namespace device { ///////////////////////////////////////////////////////////////////////////////////////////////// -/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM kernels that may - be invoked from host code. +/*! Gemm device-level operator. This is an interface to efficient CUTLASS GEMM + kernels that may be invoked from host code. The contributions of this class are: - - 1. At compile time, it maps data types and high-level structural parameters onto - specific CUTLASS components. - 2. At runtime, it maps logical arguments to GEMM problems to kernel parameters. + 1. At compile time, it maps data types and high-level structural parameters + onto specific CUTLASS components. + + 2. At runtime, it maps logical arguments to GEMM problems to kernel + parameters. 3. At runtime, it launches kernels on the device. - The intent is to provide a convenient mechanism for interacting with most plausible GEMM - configurations for each supported architecture. 
Consequently, not all parameters are exposed - to the top-level interface. Rather, sensible defaults at each level of the CUTLASS hierarchy - are selected to tradeoff simplicity of the interface with flexibility. We expect - most configurations to be specified at this level. Applications with more exotic requirements - may construct their kernels of interest using CUTLASS components at the threadblock, warp, - and thread levels of abstraction. + The intent is to provide a convenient mechanism for interacting with most + plausible GEMM configurations for each supported architecture. Consequently, + not all parameters are exposed to the top-level interface. Rather, sensible + defaults at each level of the CUTLASS hierarchy are selected to tradeoff + simplicity of the interface with flexibility. We expect most configurations to + be specified at this level. Applications with more exotic requirements may + construct their kernels of interest using CUTLASS components at the + threadblock, warp, and thread levels of abstraction. - CUTLASS exposes computations using the functor design pattern in which objects compose some - internal state with an overloaded function call operator. This enables decoupling of - initialization from execution, possibly reducing overhead during steady state phases of - application execution. + CUTLASS exposes computations using the functor design pattern in which objects + compose some internal state with an overloaded function call operator. This + enables decoupling of initialization from execution, possibly reducing + overhead during steady state phases of application execution. - CUTLASS device-level operators expose an Arguments structure encompassing each logical - input to the computation. This is distinct from the kernel-level Params structure pattern - which contains application-specific precomputed state needed by the device code. + CUTLASS device-level operators expose an Arguments structure encompassing each + logical input to the computation. This is distinct from the kernel-level + Params structure pattern which contains application-specific precomputed state + needed by the device code. - Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's SGEMM NN - is as follows: + Example of a CUTLASS GEMM operator implementing the functionality of cuBLAS's + SGEMM NN is as follows: // // Instantiate the CUTLASS GEMM operator. @@ -111,46 +114,48 @@ namespace device { template < /// Element type for A matrix operand typename ElementA, - + /// Layout type for A matrix operand typename LayoutA, - + /// Element type for B matrix operand typename ElementB, - + /// Layout type for B matrix operand typename LayoutB, - + /// Element type for C and D matrix operands typename ElementC, - + /// Layout type for C and D matrix operands typename LayoutC, - + /// Element type for internal accumulation typename ElementAccumulator, /// Operator class tag typename OperatorClass, - - /// Tag indicating architecture to tune for + + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. 
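
The gemm_complex.h comment above describes the Arguments structure and functor-call pattern shared by the device-level operators. A short sketch of that pattern, reusing the `Sgemm` alias from the earlier sketch; `M`, `N`, `K`, the device pointers, and the leading dimensions are assumed to be defined and valid:

```
// Logical GEMM arguments for D = alpha * A * B + beta * C.
Sgemm::Arguments args(
    {M, N, K},          // problem size
    {ptr_A, lda},       // TensorRef to A
    {ptr_B, ldb},       // TensorRef to B
    {ptr_C, ldc},       // TensorRef to C (source)
    {ptr_D, ldd},       // TensorRef to D (destination)
    {alpha, beta});     // epilogue parameters

Sgemm gemm_op;

// The functor call maps the logical arguments to kernel-level Params and launches.
cutlass::Status status = gemm_op(args);
```
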
typename ArchTag, - + /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape, - + /// Warp-level tile size (concept: GemmShape) typename WarpShape, - + /// Warp-level tile size (concept: GemmShape) typename InstructionShape, - + /// Epilogue output operator typename EpilogueOutputOp, - + /// Threadblock-level swizzling operator typename ThreadblockSwizzle, - + /// Number of stages used in the pipelined mainloop int Stages > @@ -173,7 +178,7 @@ template < typename ElementAccumulator_ = ElementC_, /// Operator class tag typename OperatorClass_ = arch::OpClassSimt, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. typename ArchTag_ = arch::Sm70, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_ = typename DefaultGemmConfiguration< @@ -192,7 +197,8 @@ template < OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, /// Threadblock-level swizzling operator - typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>, + typename ThreadblockSwizzle_ = + threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultGemmConfiguration + bool SplitKSerial = false> class GemmComplex { public: diff --git a/include/cutlass/gemm/device/gemm_sparse.h b/include/cutlass/gemm/device/gemm_sparse.h index bfd5606e1f..04e2dd6673 100644 --- a/include/cutlass/gemm/device/gemm_sparse.h +++ b/include/cutlass/gemm/device/gemm_sparse.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -133,7 +133,9 @@ namespace device { /// Operator class tag typename OperatorClass, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. 
typename ArchTag, /// Threadblock-level tile size (concept: GemmShape) @@ -211,9 +213,7 @@ template < /// Operation performed by GEMM typename Operator_ = typename DefaultGemmConfiguration< OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, - ElementAccumulator_>::Operator, - /// Whether Beta is zero or not - bool IsBetaZero = false> + ElementAccumulator_>::Operator> class SparseGemm { public: @@ -241,7 +241,6 @@ class SparseGemm { static int const kAlignmentB = AlignmentB; static int const kAlignmentC = EpilogueOutputOp::kCount; static bool const kSplitKSerial = SplitKSerial; - static bool const kIsBetaZero = IsBetaZero; static ComplexTransform const kTransformA = ComplexTransform::kNone; static ComplexTransform const kTransformB = ComplexTransform::kNone; @@ -265,8 +264,7 @@ class SparseGemm { ThreadblockSwizzle, kStages, kSplitKSerial, - Operator, - kIsBetaZero + Operator >::GemmKernel; using ElementE = typename GemmKernel::ElementE; diff --git a/include/cutlass/gemm/device/gemm_splitk_parallel.h b/include/cutlass/gemm/device/gemm_splitk_parallel.h index 73f1c240b0..987319c2cc 100644 --- a/include/cutlass/gemm/device/gemm_splitk_parallel.h +++ b/include/cutlass/gemm/device/gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,9 @@ template < typename ElementAccumulator_ = ElementC_, /// Operator class tag typename OperatorClass_ = arch::OpClassSimt, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag_ = arch::Sm70, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_ = typename DefaultGemmConfiguration< @@ -425,7 +427,9 @@ template < typename ElementAccumulator_, /// Operator class tag typename OperatorClass_, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag_, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_, diff --git a/include/cutlass/gemm/device/gemm_universal.h b/include/cutlass/gemm/device/gemm_universal.h index 0912909014..54f8e14932 100644 --- a/include/cutlass/gemm/device/gemm_universal.h +++ b/include/cutlass/gemm/device/gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,9 @@ template < typename ElementAccumulator_ = ElementC_, /// Operator class tag typename OperatorClass_ = arch::OpClassSimt, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. 
The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag_ = arch::Sm70, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_ = typename DefaultGemmConfiguration< @@ -202,7 +204,9 @@ template < typename ElementAccumulator_, /// Operator class tag typename OperatorClass_, - /// Tag indicating architecture to tune for + /// Tag indicating architecture to tune for. This is the minimum SM that + /// supports the intended feature. The device kernel can be built + /// targeting any SM larger than this number. typename ArchTag_, /// Threadblock-level tile size (concept: GemmShape) typename ThreadblockShape_, diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h index a669483541..fb54170134 100644 --- a/include/cutlass/gemm/device/gemm_universal_adapter.h +++ b/include/cutlass/gemm/device/gemm_universal_adapter.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h index 9ffc6b041c..74c519a44e 100644 --- a/include/cutlass/gemm/device/gemm_universal_base.h +++ b/include/cutlass/gemm/device/gemm_universal_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/gemm.h b/include/cutlass/gemm/gemm.h index 51f535f7c1..62725ffe08 100644 --- a/include/cutlass/gemm/gemm.h +++ b/include/cutlass/gemm/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm.h b/include/cutlass/gemm/kernel/default_gemm.h index 0aba2d3a72..966b00890e 100644 --- a/include/cutlass/gemm/kernel/default_gemm.h +++ b/include/cutlass/gemm/kernel/default_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
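
The recurring comment above clarifies that `ArchTag` names the minimum SM required by the selected features, not the only build target; a kernel tuned for an older tag may be compiled for any newer architecture. A hedged sketch of a Turing-era Tensor Core configuration using the documented defaults:

```
#include "cutlass/gemm/device/gemm.h"

// Tensor Core kernel whose instructions first appear on SM75 (Turing). The same
// type may also be compiled for SM80 or newer targets.
using GemmTensorOpSm75 = cutlass::gemm::device::Gemm<
    cutlass::half_t, cutlass::layout::ColumnMajor,   // A
    cutlass::half_t, cutlass::layout::ColumnMajor,   // B
    cutlass::half_t, cutlass::layout::ColumnMajor,   // C and D
    float,                                           // accumulation
    cutlass::arch::OpClassTensorOp,                  // use Tensor Cores
    cutlass::arch::Sm75>;                            // minimum SM for this configuration
```
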
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -111,9 +111,7 @@ template < /// epilogue bool SplitKSerial, /// Operation performed by GEMM - typename Operator, - /// Beta is zero or not - bool IsBetaZero = false> + typename Operator> struct DefaultGemm; //////////////////////////////////////////////////////////////////////////////// @@ -295,16 +293,14 @@ template < /// epilogue bool SplitKSerial, /// Operation performed by GEMM - typename Operator, - /// Is Beta zero or not - bool IsBetaZero> + typename Operator> struct DefaultGemm< ElementA, layout::ColumnMajorInterleaved, kAlignmentA, ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, layout::ColumnMajorInterleaved, int32_t, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, - SplitKSerial, Operator, IsBetaZero> { + SplitKSerial, Operator> { using LayoutA = layout::ColumnMajorInterleaved; using LayoutB = layout::RowMajorInterleaved; using LayoutC = layout::ColumnMajorInterleaved; @@ -324,8 +320,7 @@ struct DefaultGemm< using Epilogue = typename cutlass::epilogue::threadblock:: DefaultInterleavedEpilogueTensorOp< ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, - 64 / sizeof_bits::value, InterleavedK, - IsBetaZero>::Epilogue; + 64 / sizeof_bits::value, InterleavedK>::Epilogue; /// Define the kernel-level GEMM operator. using GemmKernel = kernel::Gemm; @@ -361,16 +356,14 @@ template < /// epilogue bool SplitKSerial, /// Operation performed by GEMM - typename Operator, - /// Is Beta zero or not - bool IsBetaZero> + typename Operator> struct DefaultGemm, kAlignmentA, ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, layout::ColumnMajorInterleaved, int32_t, arch::OpClassTensorOp, arch::Sm75, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - ThreadblockSwizzle, 2, SplitKSerial, Operator, IsBetaZero> { + ThreadblockSwizzle, 2, SplitKSerial, Operator> { using LayoutA = layout::ColumnMajorInterleaved; using LayoutB = layout::RowMajorInterleaved; using LayoutC = layout::ColumnMajorInterleaved; @@ -389,8 +382,7 @@ struct DefaultGemm, using Epilogue = typename cutlass::epilogue::threadblock:: DefaultInterleavedEpilogueTensorOp< ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, - 64 / sizeof_bits::value, InterleavedK, - IsBetaZero>::Epilogue; + 64 / sizeof_bits::value, InterleavedK>::Epilogue; /// Define the kernel-level GEMM operator. using GemmKernel = kernel::Gemm; @@ -682,7 +674,7 @@ struct DefaultGemm, EpilogueOutputOp, ThreadblockSwizzle, 2, SplitKSerial, - Operator, false> { + Operator> { using InstructionShape = GemmShape<1, 1, 4>; using ElementA = int8_t; using ElementB = int8_t; @@ -703,8 +695,7 @@ struct DefaultGemm::ThreadblockMma; static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount; diff --git a/include/cutlass/gemm/kernel/default_gemm_complex.h b/include/cutlass/gemm/kernel/default_gemm_complex.h index cff06e69de..350b3484c1 100644 --- a/include/cutlass/gemm/kernel/default_gemm_complex.h +++ b/include/cutlass/gemm/kernel/default_gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h index 870084834a..a60cf02452 100644 --- a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h +++ b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm_sparse.h b/include/cutlass/gemm/kernel/default_gemm_sparse.h index 9c43666fe0..e212f6bfca 100644 --- a/include/cutlass/gemm/kernel/default_gemm_sparse.h +++ b/include/cutlass/gemm/kernel/default_gemm_sparse.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -112,9 +112,7 @@ template < /// epilogue bool SplitKSerial, /// Operation performed by GEMM - typename Operator, - /// Beta is zero or not - bool IsBetaZero = false> + typename Operator> struct DefaultSparseGemm; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h index e23965d336..d97a93f498 100644 --- a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h +++ b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm_universal.h b/include/cutlass/gemm/kernel/default_gemm_universal.h index 579005cb41..f9094672d9 100644 --- a/include/cutlass/gemm/kernel/default_gemm_universal.h +++ b/include/cutlass/gemm/kernel/default_gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -185,8 +185,7 @@ struct DefaultGemmUniversal< ThreadblockSwizzle, Stages, true, - Operator, - false + Operator >::GemmKernel; /// Define the kernel in terms of the default kernel diff --git a/include/cutlass/gemm/kernel/default_gemm_with_reduction.h b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h new file mode 100644 index 0000000000..47c075c920 --- /dev/null +++ b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h @@ -0,0 +1,240 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Reduction based on an existing UniversalGemm kernel. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/kernel/gemm_with_fused_epilogue.h" +#include "cutlass/gemm/kernel/default_gemm_universal.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h" +#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Epilogue reduction operator + typename EpilogueReductionOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// + typename Enable = void +> +struct DefaultGemmWithReduction { + + using GemmBase = typename DefaultGemmUniversal< + ElementA_, LayoutA_, TransformA, kAlignmentA, + ElementB_, LayoutB_, TransformB, kAlignmentB, + ElementC_, LayoutC_, ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + Operator + >::GemmKernel; + + // Replace epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionTensorOp< + typename GemmBase::Epilogue::Shape, + typename GemmBase::Epilogue::WarpMmaOperator, + GemmBase::Epilogue::kPartitionsK, + ElementC_, + EpilogueOutputOp, + EpilogueReductionOp, + GemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Compose the GEMM kernel + using GemmKernel = GemmWithFusedEpilogue< + typename GemmBase::Mma, + Epilogue, + ThreadblockSwizzle + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parital specialization: ArchTag = cutlass::arch::Sm70 +/// +/// +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename 
LayoutB_, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Epilogue reduction operator + typename EpilogueReductionOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// + typename Enable +> +struct DefaultGemmWithReduction< + ElementA_, LayoutA_, TransformA, kAlignmentA, + ElementB_, LayoutB_, TransformB, kAlignmentB, + ElementC_, LayoutC_, + ElementAccumulator, + OperatorClass, + cutlass::arch::Sm70, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + EpilogueReductionOp, + ThreadblockSwizzle, + Stages, + Operator, + Enable + > { + + using GemmBase = typename DefaultGemmUniversal< + ElementA_, LayoutA_, TransformA, kAlignmentA, + ElementB_, LayoutB_, TransformB, kAlignmentB, + ElementC_, LayoutC_, ElementAccumulator, + OperatorClass, + cutlass::arch::Sm70, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + Operator + >::GemmKernel; + + // Replace epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithReductionVoltaTensorOp< + typename GemmBase::Epilogue::Shape, + typename GemmBase::Epilogue::WarpMmaOperator, + GemmBase::Epilogue::kPartitionsK, + ElementC_, + EpilogueOutputOp, + EpilogueReductionOp, + GemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Compose the GEMM kernel + using GemmKernel = GemmWithFusedEpilogue< + typename GemmBase::Mma, + Epilogue, + ThreadblockSwizzle + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/default_gemv.h b/include/cutlass/gemm/kernel/default_gemv.h index 36ae339c4e..03d9c43c52 100755 --- a/include/cutlass/gemm/kernel/default_gemv.h +++ b/include/cutlass/gemm/kernel/default_gemv.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm.h b/include/cutlass/gemm/kernel/gemm.h index ce61137f36..1d5601cdd8 100644 --- a/include/cutlass/gemm/kernel/gemm.h +++ b/include/cutlass/gemm/kernel/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
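
The new `DefaultGemmWithReduction` trait above follows the same composition style as `DefaultGemmUniversal`: it reuses the universal GEMM mainloop, swaps in an epilogue with reduction, and exposes the result as `GemmKernel` (the `arch::Sm70` specialization does the same with the Volta tensor-op epilogue). A sketch of an instantiation under assumed tile sizes; `MyOutputOp` and `MyReductionOp` are placeholders for application-supplied epilogue functors and are not taken from this diff:

```
// MyOutputOp / MyReductionOp stand in for the EpilogueOutputOp / EpilogueReductionOp
// functors an application would supply.
using GemmWithReductionKernel = typename cutlass::gemm::kernel::DefaultGemmWithReduction<
    cutlass::half_t, cutlass::layout::RowMajor,    cutlass::ComplexTransform::kNone, 8,
    cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::ComplexTransform::kNone, 8,
    cutlass::half_t, cutlass::layout::RowMajor,
    float,                                              // accumulation
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,             // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,               // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,                // instruction shape
    MyOutputOp,                                         // placeholder
    MyReductionOp,                                      // placeholder
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    3,                                                  // stages
    cutlass::arch::OpMultiplyAdd>::GemmKernel;
```
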
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_array.h b/include/cutlass/gemm/kernel/gemm_array.h index 1c59a53ae0..0df217421d 100644 --- a/include/cutlass/gemm/kernel/gemm_array.h +++ b/include/cutlass/gemm/kernel/gemm_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_batched.h b/include/cutlass/gemm/kernel/gemm_batched.h index 45ec7756f7..ceefed127f 100644 --- a/include/cutlass/gemm/kernel/gemm_batched.h +++ b/include/cutlass/gemm/kernel/gemm_batched.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_pipelined.h b/include/cutlass/gemm/kernel/gemm_pipelined.h index 02c7ba254b..39f328a30b 100644 --- a/include/cutlass/gemm/kernel/gemm_pipelined.h +++ b/include/cutlass/gemm/kernel/gemm_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex.h b/include/cutlass/gemm/kernel/gemm_planar_complex.h index b9626145fe..0151848f38 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h index e7fa89dc74..05bde223bf 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h index 72ca5a4743..e009567e4d 100644 --- a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h +++ b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h index bba6217160..0ff5ce999c 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.h +++ b/include/cutlass/gemm/kernel/gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemv_batched_strided.h b/include/cutlass/gemm/kernel/gemv_batched_strided.h index ea8d9bdf85..63f4d6e37e 100755 --- a/include/cutlass/gemm/kernel/gemv_batched_strided.h +++ b/include/cutlass/gemm/kernel/gemv_batched_strided.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/sparse_gemm.h b/include/cutlass/gemm/kernel/sparse_gemm.h index 730745fdc8..9d9e0a282d 100644 --- a/include/cutlass/gemm/kernel/sparse_gemm.h +++ b/include/cutlass/gemm/kernel/sparse_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma.h b/include/cutlass/gemm/thread/mma.h index 15dfe4338e..e163d8930a 100644 --- a/include/cutlass/gemm/thread/mma.h +++ b/include/cutlass/gemm/thread/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm50.h b/include/cutlass/gemm/thread/mma_sm50.h index 6d52efb023..e7bbbc90a6 100644 --- a/include/cutlass/gemm/thread/mma_sm50.h +++ b/include/cutlass/gemm/thread/mma_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm60.h b/include/cutlass/gemm/thread/mma_sm60.h index 07e2d55629..562c682e8a 100644 --- a/include/cutlass/gemm/thread/mma_sm60.h +++ b/include/cutlass/gemm/thread/mma_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm61.h b/include/cutlass/gemm/thread/mma_sm61.h index 09fd356236..81430d986a 100644 --- a/include/cutlass/gemm/thread/mma_sm61.h +++ b/include/cutlass/gemm/thread/mma_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_gemv_core.h b/include/cutlass/gemm/threadblock/default_gemv_core.h index 9d692d6db5..a4ac423ebe 100755 --- a/include/cutlass/gemm/threadblock/default_gemv_core.h +++ b/include/cutlass/gemm/threadblock/default_gemv_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma.h b/include/cutlass/gemm/threadblock/default_mma.h index fbf76510db..155508096c 100644 --- a/include/cutlass/gemm/threadblock/default_mma.h +++ b/include/cutlass/gemm/threadblock/default_mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core.h b/include/cutlass/gemm/threadblock/default_mma_core.h index a7ac7c44b2..5a5426f4c6 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core.h +++ b/include/cutlass/gemm/threadblock/default_mma_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/include/cutlass/gemm/threadblock/default_mma_core_simt.h index ba3a161650..2ec882cc45 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_simt.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h index 30b3b3c0aa..4e75154630 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h index d797704e79..ded7f119d3 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h index 065ed74694..8b0c0de628 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h index f7298e4e7e..26c9b95b10 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h index 8214494321..64efa9a0f4 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h index 2f4a079619..d5f963bd01 100644 --- a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h +++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h index 04a856e9a4..a204f95410 100644 --- a/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h +++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h index 36c5c54ee9..0c3972145e 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h index 613c88e3ea..6a1e48fc0c 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h index 697d22bf6d..9528744b7d 100644 --- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_sparse_mma.h b/include/cutlass/gemm/threadblock/default_sparse_mma.h index 3f6354771e..b390382a0f 100644 --- a/include/cutlass/gemm/threadblock/default_sparse_mma.h +++ b/include/cutlass/gemm/threadblock/default_sparse_mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/gemv.h b/include/cutlass/gemm/threadblock/gemv.h new file mode 100755 index 0000000000..584b375da6 --- /dev/null +++ b/include/cutlass/gemm/threadblock/gemv.h @@ -0,0 +1,141 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Template for a threadblock-scoped GEMV kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix-vector product using SIMT math instructions.
+template < + class Core_ //< GemvCore +> +class Gemv { +public: + using Shape = typename Core_::Shape; + + /// The MMA operator that computes GEMV + using Operator = typename Core_::Operator; + + /// Iterates over A in global memory + using IteratorA = typename Core_::IteratorA; + + /// Iterates over B in global memory + using IteratorB = typename Core_::IteratorB; + + /// Iterates over C in global memory + using IteratorC = typename Core_::IteratorC; + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of operand accumulator loaded/stored to global memory + using FragmentC = typename Operator::FragmentC; + + /// Shape of the per-thread GEMV operation + using ThreadShape = typename Core_::ThreadShape; + +public: + CUTLASS_DEVICE + Gemv() { } + + CUTLASS_DEVICE + void operator()( + GemmCoord const &problem_size, ///< problem size of batched GEMV + FragmentC &accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const &src_accum) { ///< source accumulator tile + + // + // Prologue + // + + FragmentA frag_A; + FragmentB frag_B; + frag_A.clear(); + frag_B.clear(); + + iterator_A.load(frag_A); + iterator_B.load(frag_B); + ++iterator_A; + ++iterator_B; + + // + // Mainloop + // + Operator thread_mma; + int gemm_k = problem_size.k(); + + if (gemm_k < Shape::kK) + { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + + // iterate over K to accumulate result + CUTLASS_GEMM_LOOP + for (; gemm_k > 0; gemm_k -= Shape::kK) { + thread_mma(accum, frag_A, frag_B, accum); + + iterator_A.load(frag_A); + iterator_B.load(frag_B); + ++iterator_A; + ++iterator_B; + + if (gemm_k < Shape::kK) + { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/include/cutlass/gemm/threadblock/mma_base.h b/include/cutlass/gemm/threadblock/mma_base.h index dbf3d31f56..a56d81f0e0 100644 --- a/include/cutlass/gemm/threadblock/mma_base.h +++ b/include/cutlass/gemm/threadblock/mma_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_multistage.h b/include/cutlass/gemm/threadblock/mma_multistage.h index 804e3373a3..d07b236d40 100644 --- a/include/cutlass/gemm/threadblock/mma_multistage.h +++ b/include/cutlass/gemm/threadblock/mma_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_pipelined.h b/include/cutlass/gemm/threadblock/mma_pipelined.h index 80954f6c4f..5fcbdebe1a 100644 --- a/include/cutlass/gemm/threadblock/mma_pipelined.h +++ b/include/cutlass/gemm/threadblock/mma_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h index b37b418462..22c9b3f863 100644 --- a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h index 18e63b5805..fedad053b0 100644 --- a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h index ecf722d92a..0e48b2bd1a 100644 --- a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_singlestage.h b/include/cutlass/gemm/threadblock/mma_singlestage.h index 373d985ac6..edcef03699 100644 --- a/include/cutlass/gemm/threadblock/mma_singlestage.h +++ b/include/cutlass/gemm/threadblock/mma_singlestage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_sparse_base.h b/include/cutlass/gemm/threadblock/mma_sparse_base.h index c6bb3411fc..eb192f723e 100644 --- a/include/cutlass/gemm/threadblock/mma_sparse_base.h +++ b/include/cutlass/gemm/threadblock/mma_sparse_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_sparse_multistage.h b/include/cutlass/gemm/threadblock/mma_sparse_multistage.h index a2ff84664a..e865585b1c 100644 --- a/include/cutlass/gemm/threadblock/mma_sparse_multistage.h +++ b/include/cutlass/gemm/threadblock/mma_sparse_multistage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/include/cutlass/gemm/threadblock/threadblock_swizzle.h index 587de56a66..79314088dd 100644 --- a/include/cutlass/gemm/threadblock/threadblock_swizzle.h +++ b/include/cutlass/gemm/threadblock/threadblock_swizzle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h index 3c6772aff7..b397b4567d 100644 --- a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h index 637e39009e..bdc2341e3f 100644 --- a/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_tensor_op.h index ea9ab5c931..8240c430b4 100644 --- a/include/cutlass/gemm/warp/default_mma_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h index 06d3afa59f..c550b022f7 100644 --- a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h +++ b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h index 582fb472e1..5f8864a500 100644 --- a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma.h b/include/cutlass/gemm/warp/mma.h index 16c736e2b7..7180434e1e 100644 --- a/include/cutlass/gemm/warp/mma.h +++ b/include/cutlass/gemm/warp/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_complex_tensor_op.h index a34c16df07..5877b95f3b 100644 --- a/include/cutlass/gemm/warp/mma_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h index b95af0df15..ba74fe96e3 100644 --- a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h index 4ab139023a..7cfad2ea6d 100644 --- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h index 8d9417b0fb..dacfe266e8 100644 --- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_planar_complex.h b/include/cutlass/gemm/warp/mma_planar_complex.h index c579044065..6328105b78 100644 --- a/include/cutlass/gemm/warp/mma_planar_complex.h +++ b/include/cutlass/gemm/warp/mma_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_simt.h b/include/cutlass/gemm/warp/mma_simt.h index 306a08d17c..a86e06e461 100644 --- a/include/cutlass/gemm/warp/mma_simt.h +++ b/include/cutlass/gemm/warp/mma_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_simt_policy.h b/include/cutlass/gemm/warp/mma_simt_policy.h index 6abd0bf6a8..de89d5123a 100644 --- a/include/cutlass/gemm/warp/mma_simt_policy.h +++ b/include/cutlass/gemm/warp/mma_simt_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h index ed1e598702..660db38803 100644 --- a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -214,7 +214,7 @@ class MmaSimtTileIterator *dst_ptr = @@ -273,6 +273,213 @@ class MmaSimtTileIterator +class MmaSimtTileIterator { +public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand::kA; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::RowMajor; + + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + // + // Derived quantities + // + + static_assert(!(Shape::kRow % Policy::WarpShape::kRow), + "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension."); + + static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero."); + static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero."); + static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero."); + static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero."); + + /// Thread-level shape of a fragment + using ThreadShape = MatrixShape< + Shape::kRow / Policy::WarpShape::kRow, + Shape::kColumn + >; + + static_assert(!(ThreadShape::kRow % Policy::LaneMmaShape::kM), + "Thread-level GEMM must be divisible by Policy::LaneMmaShape."); + + /// Number of individual loads (scalar loads) + using Iterations = MatrixShape< + ThreadShape::kRow / Policy::LaneMmaShape::kM, + ThreadShape::kColumn + >; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + +private: + + /// Internal reference + cutlass::TensorRef ref_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaSimtTileIterator() { } + + /// 
Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaSimtTileIterator( + TensorRef ref, + int lane_id + ) { + + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + MatrixCoord(Policy::LaneMmaShape::kM, 0); + + ref.add_coord_offset(lane_offset); + + ref_.reset(ref.data(), ref.stride(0)); + + } + + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaSimtTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + + ref_.add_coord_offset({ + coord.row() * Shape::kRow, + coord.column() * Shape::kColumn}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaSimtTileIterator & operator++() { + + ref_.add_coord_offset({0, Shape::kColumn}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaSimtTileIterator & operator--() { + + ref_.add_coord_offset({0, -Shape::kColumn}); + + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (scalar loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kColumn; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Iterations::kRow; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Policy::LaneMmaShape::kM; i++) { + + frag[m * Policy::LaneMmaShape::kM + i + k * Iterations::kRow] = + *(ref_.data() + + ref_.offset({m * Policy::WarpShape::kRow * Policy::LaneMmaShape::kM + i, k}) + + pointer_offset); + } + } + } + } + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const { + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kColumn; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Iterations::kRow; ++m) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Policy::LaneMmaShape::kM; i++) { + + *(ref_.data() + ref_.offset(m * Policy::WarpShape::kM * Policy::LaneMmaShape::kM + i, k) + pointer_offset) = + frag[m * Policy::LaneMmaShape::kM + i + k * Iterations::kM]; + } + } + } + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) const { + store_with_pointer_offset(frag, 0); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation here + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Specialization for B operands of row-major layouts /// /// Concept: MutableRandomAccessContiguousTileIteratorConcept @@ -354,7 +561,6 @@ class MmaSimtTileIterator, layout::RowMajor> ref_; - public: /// Default ctor constructs null iterator @@ -417,7 +623,7 @@ class MmaSimtTileIterator +class MmaSimtTileIterator { +public: + + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand::kB; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::ColumnMajor; + + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + // + // Derived quantities + // + + static_assert(!(Shape::kColumn % Policy::WarpShape::kColumn), + "The warp-level GEMM N size must be divisible by the number of threads arranged along the N dimension."); + + static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero."); + static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero."); + static_assert(Policy::WarpShape::kColumn > 0, "Policy::WarpShape::kColumn must be greater than zero."); + static_assert(Shape::kColumn / Policy::WarpShape::kColumn > 0, "Shape::kColumn / Policy::WarpShape::kColumn must be greater than zero."); + + /// Thread-level shape of a fragment + using ThreadShape = MatrixShape< + Shape::kRow, + Shape::kColumn / Policy::WarpShape::kColumn + >; + + static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), + "Thread-level GEMM must be divisible by Policy::LaneMmaShape."); + + /// Number of individual loads + using Iterations = MatrixShape< + ThreadShape::kRow, + ThreadShape::kColumn / Policy::LaneMmaShape::kN + >; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + +private: + + /// Internal reference + cutlass::TensorRef ref_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaSimtTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaSimtTileIterator( + TensorRef ref, + int lane_id + ) { + + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + MatrixCoord(0, Policy::LaneMmaShape::kN); + + ref.add_coord_offset(lane_offset); + + ref_.reset(ref.data(), ref.stride(0)); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaSimtTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + + ref_.add_coord_offset({ + coord.row() * Shape::kRow, + coord.column() * Shape::kColumn}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaSimtTileIterator & operator++() { + + 
ref_.add_coord_offset({Shape::kRow, 0}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaSimtTileIterator & operator--() { + + ref_.add_coord_offset({-Shape::kRow, 0}); + + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (scalar loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kRow; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kColumn; ++n) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Policy::LaneMmaShape::kN; ++i) { + frag[n * Policy::LaneMmaShape::kN + i + k * Iterations::kColumn] = + *(ref_.data() + + ref_.offset({k, n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN + i}) + + pointer_offset); + } + } + } + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const { + + Array const *src_ptr = + reinterpret_cast *>(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kM; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kN; ++n) { + *(ref_.data() + ref_.offset({k, n * Policy::WarpShape::kN}) + pointer_offset / Policy::LaneMmaShape::kN) = + src_ptr[n + k * Iterations::kN]; + } + } + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store(Fragment const &frag, Index pointer_offset) const { + store_with_pointer_offset(frag, 0); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation here + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Specialization for C operands of column-major layouts /// /// Concept: MutableRandomAccessContiguousTileIteratorConcept diff --git a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h index ba86e08583..86c50d3768 100644 --- a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h index a60a86020a..a6f83129fc 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h index 5b5b5345a0..e7a77f72ac 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h @@ -44,9 +44,7 @@ template < /// Shape of one matrix product operation (concept: MatrixShape) typename InstructionShape_, /// Output operation on the fragment - typename OutputOp_, - /// Whether beta is zero - bool IsBetaZero_ > + typename OutputOp_> class MmaTensorOpFragmentIterator; @@ -68,7 +66,7 @@ template < typename OutputOp_> class MmaTensorOpFragmentIterator { + InstructionShape_, OutputOp_> { public: /// Shape of warp tile to load (concept: MatrixShape) @@ -105,8 +103,10 @@ class MmaTensorOpFragmentIterator(&frag); - int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow; - int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow - * MmaIterations::kColumn; + int index = index_ * MmaIterations::kCount; CUTLASS_PRAGMA_UNROLL for (int n = 0; n < MmaIterations::kColumn; n++) { for (int m = 0; m < MmaIterations::kRow; m++) { int accumulator_access_offset = - (n + index_n) * AccumulatorIterations::kRow + m + index_m; + n * AccumulatorIterations::kRow + m + index; - frag_ptr[n * MmaIterations::kRow + m].clear(); + frag_ptr[m * MmaIterations::kColumn + n].clear(); if(!(is_residual_tile_ && index_ >= kResidualIndex)) - //frag_ptr[n * MmaIterations::kRow + m] = accumulators_[accumulator_access_offset]; - frag_ptr[n * MmaIterations::kRow + m] = output_op(accumulators_[accumulator_access_offset], src_fragment); + frag_ptr[m * MmaIterations::kColumn + n] = output_op(accumulators_[accumulator_access_offset], src_fragment); } } } @@ -251,7 +247,7 @@ template < typename OutputOp_> class MmaTensorOpFragmentIterator { + InstructionShape_, OutputOp_> { public: /// Shape of warp tile to load (concept: MatrixShape) @@ -294,7 +290,7 @@ class MmaTensorOpFragmentIterator(&frag); -// NumericArrayConverter fragmentConverter; int index = index_ * AccessIterations::kCount; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < AccessIterations::kCount; i++) { -// int index_m = (index % AccessIterations::kCount) / (AccessIterations::kColumn * kIterationsPerInstruction) -// * kIterationsPerInstruction + index % kIterationsPerInstruction; -// -// int index_n = (index / AccessIterations::kCount) * MmaIterations::kColumn + -// (index % (AccessIterations::kColumn * kIterationsPerInstruction)) -// / kIterationsPerInstruction * AccessIterations::kColumn; -// -// int accumulator_access_offset = index_m / kIterationsPerInstruction * AccessIterations::kCount * kIterationsPerInstruction -// + index_m % kIterationsPerInstruction + index_n * kIterationsPerInstruction; int accumulator_access_offset = index / AccessIterations::kCount * (MmaIterations::kColumn * kIterationsPerInstruction) + (index % AccessIterations::kCount) / (AccessIterations::kColumn * kIterationsPerInstruction) * @@ -442,7 +428,6 @@ class MmaTensorOpFragmentIterator= kResidualIndex)) - // frag_ptr[m * MmaIterations::kColumn + n] = fragmentConverter(accumulators_[accumulator_access_offset]); frag_ptr[i*kIterationsPerAccess + j] = output_op(accumulators_[accumulator_access_offset + j * kAccessStride], src_fragment); } 
index++; diff --git a/include/cutlass/gemm/warp/mma_tensor_op_policy.h b/include/cutlass/gemm/warp/mma_tensor_op_policy.h index 68b28bfff1..4dd57da20a 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_policy.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h index cc1a909532..409eda4082 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h index 59f68a42a1..9d4d81d49d 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h index c57cc6a8d9..4be831f366 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -238,7 +238,7 @@ class MmaVoltaTensorOpMultiplicandTileIterator< pointer_[0] = pointer_[1]; pointer_[1] = tmp_pointer; } - contiguous_offset = contiguous_offset / 2; + contiguous_offset = contiguous_offset / 2 * 2; } int offset = (strided_offset * InstructionShape::kStrided) * stride_ * diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h index e286ed1162..4d45ecf5dd 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h index a7e69816f1..5a82a702fa 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h index 64be655680..6fd783c6dd 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h index 824e207d74..c000dd6283 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/tile_iterator_planar_complex.h b/include/cutlass/gemm/warp/tile_iterator_planar_complex.h index a3050c4299..ef5767198d 100644 --- a/include/cutlass/gemm/warp/tile_iterator_planar_complex.h +++ b/include/cutlass/gemm/warp/tile_iterator_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/half.h b/include/cutlass/half.h index 3d0bd34724..5503f5b318 100644 --- a/include/cutlass/half.h +++ b/include/cutlass/half.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/integer_subbyte.h b/include/cutlass/integer_subbyte.h index df32042d0e..bd8a6a0108 100644 --- a/include/cutlass/integer_subbyte.h +++ b/include/cutlass/integer_subbyte.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -144,7 +144,7 @@ struct integer_subbyte { /// Greater than CUTLASS_HOST_DEVICE bool operator>(integer_subbyte const &rhs) const { - return !(rhs < *this); + return !(*this <= rhs); } }; diff --git a/include/cutlass/kernel_launch.h b/include/cutlass/kernel_launch.h index bd84a35781..cda1896c0f 100644 --- a/include/cutlass/kernel_launch.h +++ b/include/cutlass/kernel_launch.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/layout.h b/include/cutlass/layout/layout.h index 775357d125..4d78c4c45d 100644 --- a/include/cutlass/layout/layout.h +++ b/include/cutlass/layout/layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/matrix.h b/include/cutlass/layout/matrix.h index 0590492625..668245fcb7 100644 --- a/include/cutlass/layout/matrix.h +++ b/include/cutlass/layout/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/pitch_linear.h b/include/cutlass/layout/pitch_linear.h index a6158b32a4..a44825c1d8 100644 --- a/include/cutlass/layout/pitch_linear.h +++ b/include/cutlass/layout/pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor.h b/include/cutlass/layout/tensor.h index 7f608dcf76..1196b726eb 100644 --- a/include/cutlass/layout/tensor.h +++ b/include/cutlass/layout/tensor.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -149,11 +149,11 @@ class TensorNHWC { fast_divmod(w, tmp, w, int(stride_[0]), c_mul, c_shr); #else - n = int(index / (stride_[0] * stride_[1] * stride_[2])); - LongIndex residual = index % (stride_[0] * stride_[1] * stride_[2]); + n = int(index / stride_[2]); + LongIndex residual = index % stride_[2]; - h = int(residual / (stride_[0] * stride_[1])); - residual = (residual % (stride_[0] * stride_[1])); + h = int(residual / stride_[1]); + residual = (residual % stride_[1]); w = int(residual / stride_[0]); c = int(residual % stride_[0]); @@ -314,6 +314,15 @@ class TensorNCxHWx { CUTLASS_HOST_DEVICE TensorNCxHWx(Stride const &stride = Stride(0)): stride_(stride) { } + /// Constructor + CUTLASS_HOST_DEVICE + TensorNCxHWx( + typename Stride::Index stride_w, ///< number of elements between adjacent W coordinates + typename Stride::Index stride_h, ///< number of elements between adjacent H coordinates + typename Stride::Index stride_n ///< number of elements between adjacent N coordinates + ): + stride_(make_Coord(stride_w, stride_h, stride_n)) { } + /// Helper returns a layout to a tightly packed tensor CUTLASS_HOST_DEVICE static TensorNCxHWx packed(TensorCoord const &extent) { @@ -404,6 +413,15 @@ class TensorCxRSKx { CUTLASS_HOST_DEVICE TensorCxRSKx(Stride const &stride = Stride(0)): stride_(stride) { } + /// Constructor + CUTLASS_HOST_DEVICE + TensorCxRSKx( + typename Stride::Index stride_w, ///< number of elements between adjacent W coordinates + typename Stride::Index stride_h, ///< number of elements between adjacent H coordinates + typename Stride::Index stride_n ///< number of elements between adjacent N coordinates + ): + stride_(make_Coord(stride_w, stride_h, stride_n)) { } + /// Helper returns a layout to a tightly packed tensor CUTLASS_HOST_DEVICE static TensorCxRSKx packed(TensorCoord const &extent) { @@ -529,6 +547,12 @@ class TensorNDHWC { LongIndex(stride_[3] * coord.n()); } + /// Returns the offset of a pitchlinear coordinate in linear memory. + CUTLASS_HOST_DEVICE + LongIndex operator()(PitchLinearCoord coord) const { + return coord.contiguous() + LongIndex(coord.strided() * stride_[3]); + } + /// Returns the stride of the layout CUTLASS_HOST_DEVICE Stride stride() const { diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm70.h b/include/cutlass/layout/tensor_op_multiplicand_sm70.h index 03f87db392..9d375e6c48 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm70.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
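The `TensorNHWC::inverse` change above reflects that `stride_` already holds cumulative strides (`C`, `W*C`, `H*W*C`), so the linear index is decomposed by dividing by each stride directly rather than by products of strides. A standalone sketch of that round trip (hypothetical helper functions, not the CUTLASS layout class):

```cpp
#include <cassert>
#include <cstdint>

// NHWC layout with cumulative strides:
//   stride_w = C          (elements between adjacent W coordinates)
//   stride_h = W * C      (elements between adjacent H coordinates)
//   stride_n = H * W * C  (elements between adjacent N coordinates)
struct NhwcCoord { int n, h, w, c; };

int64_t offset(NhwcCoord p, int64_t stride_w, int64_t stride_h, int64_t stride_n) {
  return p.c + p.w * stride_w + p.h * stride_h + int64_t(p.n) * stride_n;
}

// Inverse: divide by the cumulative strides directly (as in the patched code);
// multiplying strides together would double-count the cumulative factors.
NhwcCoord inverse(int64_t index, int64_t stride_w, int64_t stride_h, int64_t stride_n) {
  NhwcCoord p;
  p.n = int(index / stride_n);
  int64_t residual = index % stride_n;
  p.h = int(residual / stride_h);
  residual %= stride_h;
  p.w = int(residual / stride_w);
  p.c = int(residual % stride_w);
  return p;
}

int main() {
  int64_t C = 32, W = 7, H = 7;
  int64_t sw = C, sh = W * C, sn = H * W * C;
  NhwcCoord p{2, 3, 5, 17};
  NhwcCoord q = inverse(offset(p, sw, sh, sn), sw, sh, sn);
  assert(q.n == p.n && q.h == p.h && q.w == p.w && q.c == p.c);
  return 0;
}
```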
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm75.h b/include/cutlass/layout/tensor_op_multiplicand_sm75.h index b52483355c..5f81c1dae6 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm75.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm80.h b/include/cutlass/layout/tensor_op_multiplicand_sm80.h index e5963a2a80..5d2ffc5ffa 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm80.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/vector.h b/include/cutlass/layout/vector.h index b54b6b3b18..126d30b2fe 100644 --- a/include/cutlass/layout/vector.h +++ b/include/cutlass/layout/vector.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix.h b/include/cutlass/matrix.h index 5d05ee8994..971f125e45 100644 --- a/include/cutlass/matrix.h +++ b/include/cutlass/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_coord.h b/include/cutlass/matrix_coord.h index b432665e8c..dcf25cc64f 100644 --- a/include/cutlass/matrix_coord.h +++ b/include/cutlass/matrix_coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_shape.h b/include/cutlass/matrix_shape.h index cb3118c2d6..5b672ebbc6 100644 --- a/include/cutlass/matrix_shape.h +++ b/include/cutlass/matrix_shape.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/numeric_conversion.h b/include/cutlass/numeric_conversion.h index 766478e085..57f3984b41 100644 --- a/include/cutlass/numeric_conversion.h +++ b/include/cutlass/numeric_conversion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -1359,6 +1359,60 @@ struct PreferredRoundingMode { ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Packs predicates into an array. +template +struct PackPredicates { + using result_type = Array; + + static_assert(!(N % 4), "Must pack predicates in a count that is a multiple of 4"); + + CUTLASS_HOST_DEVICE + result_type operator()(bool const predicates[]) { + + result_type packed; + packed.clear(); + + int const kWordSize = 8; + uint8_t *bytes = reinterpret_cast(packed.data()); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + int word_idx = (i / kWordSize); + int bit_idx = (i % kWordSize); + + uint8_t mask = ((predicates[i] ? 1u : 0u) << bit_idx); + bytes[word_idx] = (bytes[word_idx] | mask); + } + return packed; + } +}; + +/// Packs predicates into an array +template +struct UnpackPredicates { + using result_type = Array; + + static_assert(!(N % 4), "Must unpack predicates in a count that is a multiple of 4"); + + CUTLASS_HOST_DEVICE + void operator()(bool predicates[], result_type const &packed) { + + int const kWordSize = 8; + uint8_t const *bytes = reinterpret_cast(packed.data()); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + int word_idx = (i / kWordSize); + int bit_idx = (i % kWordSize); + + predicates[i] = bool((bytes[word_idx] >> bit_idx) & 0x1); + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace cutlass ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/numeric_types.h b/include/cutlass/numeric_types.h index 9479ccb08b..363997b620 100644 --- a/include/cutlass/numeric_types.h +++ b/include/cutlass/numeric_types.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
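The `PackPredicates` / `UnpackPredicates` helpers added to `numeric_conversion.h` above store one predicate per bit, eight per byte, least-significant bit first. A self-contained host sketch of the same bit layout (plain arrays and hypothetical function names instead of `cutlass::Array`):

```cpp
#include <cassert>
#include <cstdint>

// Pack N boolean predicates into ceil(N / 8) bytes, one bit each, LSB first,
// mirroring the bit layout used by PackPredicates above.
template <int N>
void pack_predicates(uint8_t bytes[(N + 7) / 8], bool const predicates[N]) {
  for (int i = 0; i < (N + 7) / 8; ++i) bytes[i] = 0;
  for (int i = 0; i < N; ++i) {
    int word_idx = i / 8;
    int bit_idx = i % 8;
    bytes[word_idx] |= uint8_t((predicates[i] ? 1u : 0u) << bit_idx);
  }
}

template <int N>
void unpack_predicates(bool predicates[N], uint8_t const bytes[(N + 7) / 8]) {
  for (int i = 0; i < N; ++i) {
    predicates[i] = ((bytes[i / 8] >> (i % 8)) & 0x1) != 0;
  }
}

int main() {
  bool in[8] = {true, false, true, true, false, false, true, false};
  uint8_t packed[1];
  pack_predicates<8>(packed, in);
  assert(packed[0] == 0b01001101);   // bit i holds predicate i
  bool out[8];
  unpack_predicates<8>(out, packed);
  for (int i = 0; i < 8; ++i) assert(out[i] == in[i]);
  return 0;
}
```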
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -38,7 +38,6 @@ namespace cutlass { - ///////////////////////////////////////////////////////////////////////////////////////////////// /// Defines the size of an element in bits diff --git a/include/cutlass/platform/platform.h b/include/cutlass/platform/platform.h index 826b3977fc..e9ccae2e7f 100644 --- a/include/cutlass/platform/platform.h +++ b/include/cutlass/platform/platform.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/predicate_vector.h b/include/cutlass/predicate_vector.h index 9293696225..6ef748fb2e 100644 --- a/include/cutlass/predicate_vector.h +++ b/include/cutlass/predicate_vector.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/quaternion.h b/include/cutlass/quaternion.h index aef35025d3..67e0634afb 100644 --- a/include/cutlass/quaternion.h +++ b/include/cutlass/quaternion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/real.h b/include/cutlass/real.h index 99af846b19..faa7d92d0d 100644 --- a/include/cutlass/real.h +++ b/include/cutlass/real.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/device/reduce_split_k.h b/include/cutlass/reduction/device/reduce_split_k.h index e3626f88c0..4c044a4cab 100644 --- a/include/cutlass/reduction/device/reduce_split_k.h +++ b/include/cutlass/reduction/device/reduce_split_k.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/kernel/reduce_split_k.h b/include/cutlass/reduction/kernel/reduce_split_k.h index 586c90d86a..870b94b8ed 100644 --- a/include/cutlass/reduction/kernel/reduce_split_k.h +++ b/include/cutlass/reduction/kernel/reduce_split_k.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/thread/reduce.h b/include/cutlass/reduction/thread/reduce.h index 698b174f95..a0f2d18fff 100644 --- a/include/cutlass/reduction/thread/reduce.h +++ b/include/cutlass/reduction/thread/reduce.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/thread/reduction_operators.h b/include/cutlass/reduction/thread/reduction_operators.h index 6f9aeb6f32..3c29bf7dc3 100644 --- a/include/cutlass/reduction/thread/reduction_operators.h +++ b/include/cutlass/reduction/thread/reduction_operators.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -35,6 +35,8 @@ #include "cutlass/functional.h" #include "cutlass/numeric_conversion.h" +///////////////////////////////////////////////////////////////////////////////////////////////// + namespace cutlass { namespace reduction { namespace thread { @@ -97,6 +99,131 @@ struct ReduceAdd { ///////////////////////////////////////////////////////////////////////////////////////////////// +namespace detail { + +/// Special handling for binary operators +template +struct VectorizeArrayOperation { + + using ValueType = Array; + + CUTLASS_HOST_DEVICE + ValueType operator()( + ReductionOp const &reduction_op, + ValueType const &lhs, + ValueType const &rhs) const { + + ValueType result; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = reduction_op(lhs[i], rhs[i]); + } + + return result; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ReduceArrayOperation { + + using ArrayType = Array; + + CUTLASS_HOST_DEVICE + Element operator()( + ReductionOp const &reduction_op, + ArrayType const &array) const { + + Element item = reduction_op(array[0], array[1]); + + CUTLASS_PRAGMA_UNROLL + for (int i = 2; i < N; ++i) { + item = reduction_op(item, array[i]); + } + + return item; + } +}; + +template +struct ReduceArrayOperation, uint1b_t, N> { + + using ArrayType = Array; + + CUTLASS_HOST_DEVICE + uint1b_t operator()( + logical_and const &reduction_op, + ArrayType const &array) const { + + uint8_t const *ptr = reinterpret_cast(&array); + bool item = false; + + CUTLASS_PRAGMA_UNROLL + for (int byte = 0; byte < (N + 7) / 8; ++byte) { + uint8_t bits = ptr[byte]; + item = (item || !bits); + } + + return uint1b_t(!item); + } +}; + +template +struct ReduceArrayOperation, uint1b_t, N> { + + using ArrayType = Array; + + CUTLASS_HOST_DEVICE + uint1b_t operator()( + logical_and const &reduction_op, + ArrayType const &array) const { + + uint8_t const *ptr = reinterpret_cast(&array); + bool item = true; + + CUTLASS_PRAGMA_UNROLL + for (int byte = 0; byte < (N + 7) / 8; ++byte) { + uint8_t bits = ptr[byte]; + item = (item || bits); + } + + return uint1b_t(item); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper function to infer template argument types +template +CUTLASS_HOST_DEVICE +Array ApplyArrayOperator( + ReductionOp const &reduction_op, + Array const &lhs, + Array const &rhs) { + + VectorizeArrayOperation vectorize_op; + + return vectorize_op(reduction_op, lhs, rhs); +} + +/// Helper to reduce an array +template +Element ReduceArray(ReductionOp const &reduction_op, Array const &array) { + ReduceArrayOperation reduce_array_op; + + return reduce_array_op(reduction_op, array); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace thread } // namespace reduction } // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/reduction/threadblock_swizzle.h b/include/cutlass/reduction/threadblock_swizzle.h index 2419cdf6f5..943b818d16 100644 --- a/include/cutlass/reduction/threadblock_swizzle.h +++ 
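The `detail` helpers added to `reduction_operators.h` above apply a scalar reduction operator either elementwise across two arrays or across all elements of one array. A minimal standalone equivalent using `std::array` and a callable (not the CUTLASS functors or the bit-packed `uint1b_t` specializations):

```cpp
#include <array>
#include <cassert>
#include <cstddef>
#include <functional>

// Elementwise combine of two arrays with a binary reduction operator
// (the role played by VectorizeArrayOperation / ApplyArrayOperator).
template <typename Op, typename T, std::size_t N>
std::array<T, N> apply_array(Op op, std::array<T, N> const &lhs, std::array<T, N> const &rhs) {
  std::array<T, N> result{};
  for (std::size_t i = 0; i < N; ++i) result[i] = op(lhs[i], rhs[i]);
  return result;
}

// Fold one array down to a scalar (the role played by ReduceArrayOperation / ReduceArray).
template <typename Op, typename T, std::size_t N>
T reduce_array(Op op, std::array<T, N> const &array) {
  T item = op(array[0], array[1]);
  for (std::size_t i = 2; i < N; ++i) item = op(item, array[i]);
  return item;
}

int main() {
  std::array<int, 4> a{1, 2, 3, 4}, b{10, 20, 30, 40};
  auto sums = apply_array(std::plus<int>{}, a, b);   // {11, 22, 33, 44}
  assert(sums[2] == 33);
  assert(reduce_array(std::plus<int>{}, a) == 10);
  return 0;
}
```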
b/include/cutlass/reduction/threadblock_swizzle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h index 3d6a43b952..d75959152b 100644 --- a/include/cutlass/relatively_equal.h +++ b/include/cutlass/relatively_equal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/semaphore.h b/include/cutlass/semaphore.h index dc5523dca1..87f519053e 100644 --- a/include/cutlass/semaphore.h +++ b/include/cutlass/semaphore.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/subbyte_reference.h b/include/cutlass/subbyte_reference.h index 6f7aab2c6d..950c8da4ee 100644 --- a/include/cutlass/subbyte_reference.h +++ b/include/cutlass/subbyte_reference.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -358,6 +358,12 @@ class SubbyteReference { return ptr_; } + /// Gets storage pointer + CUTLASS_HOST_DEVICE + Element * operator&() const { + return reinterpret_cast(ptr_); + } + /// Gets element offset within storage vector CUTLASS_HOST_DEVICE int element_offset() const { diff --git a/include/cutlass/tensor_coord.h b/include/cutlass/tensor_coord.h index b60bc11262..5c0c603171 100644 --- a/include/cutlass/tensor_coord.h +++ b/include/cutlass/tensor_coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_ref.h b/include/cutlass/tensor_ref.h index a805107c3d..2782b49fc0 100644 --- a/include/cutlass/tensor_ref.h +++ b/include/cutlass/tensor_ref.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. 
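The `operator&` added to `SubbyteReference` above exposes the underlying storage pointer of a proxy reference; sub-byte elements have no addressable storage of their own, so reads and writes go through mask-and-shift. A minimal host sketch of such a proxy (hypothetical `NibbleRef`, not the CUTLASS class):

```cpp
#include <cassert>
#include <cstdint>

// Proxy reference to a single 4-bit element packed two-per-byte.
struct NibbleRef {
  uint8_t *storage;  // byte containing the element
  int index;         // 0 = low nibble, 1 = high nibble

  int get() const { return (*storage >> (index * 4)) & 0xF; }

  NibbleRef &operator=(int value) {
    uint8_t mask = uint8_t(0xF << (index * 4));
    *storage = uint8_t((*storage & ~mask) | ((value & 0xF) << (index * 4)));
    return *this;
  }

  // Analogue of the operator& added above: expose the underlying storage.
  uint8_t *raw_pointer() const { return storage; }
};

int main() {
  uint8_t data[2] = {0, 0};
  NibbleRef ref{&data[0], 1};        // high nibble of byte 0
  ref = 0xA;
  assert(data[0] == 0xA0);
  assert(ref.get() == 0xA);
  assert(ref.raw_pointer() == &data[0]);
  return 0;
}
```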
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_ref_planar_complex.h b/include/cutlass/tensor_ref_planar_complex.h index 54611911ca..009608db8d 100644 --- a/include/cutlass/tensor_ref_planar_complex.h +++ b/include/cutlass/tensor_ref_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_view.h b/include/cutlass/tensor_view.h index fdbee1055e..333c559af9 100644 --- a/include/cutlass/tensor_view.h +++ b/include/cutlass/tensor_view.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -219,7 +219,9 @@ class TensorView : public TensorRef { TensorCoord const& location = TensorCoord() ///< resulting view's origin within the old view ) const { - return TensorView(ref(), extent.clamp(extent_ - location)).add_coord_offset(location); + TensorView result(this->ref(), extent.clamp(extent_ - location)); + result.add_coord_offset(location); + return result; } /// Returns the number of scalar elements needed to store tensor. diff --git a/include/cutlass/tensor_view_planar_complex.h b/include/cutlass/tensor_view_planar_complex.h index bdd29829da..80d32f1c00 100644 --- a/include/cutlass/tensor_view_planar_complex.h +++ b/include/cutlass/tensor_view_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -208,7 +208,9 @@ class TensorViewPlanarComplex : public TensorRefPlanarComplex TensorCoord const& location = TensorCoord() ///< resulting view's origin within the old view ) const { - return TensorViewPlanarComplex(ref(), extent.clamp(extent_ - location)).add_coord_offset(location); + TensorViewPlanarComplex result(this->ref(), extent.clamp(extent_ - location)); + result.add_coord_offset(location); + return result; } /// Returns the number of scalar elements needed to store tensor. diff --git a/include/cutlass/tfloat32.h b/include/cutlass/tfloat32.h index 2d28851299..67a7f1c7b0 100644 --- a/include/cutlass/tfloat32.h +++ b/include/cutlass/tfloat32.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
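The `subview` changes to `TensorView` and `TensorViewPlanarComplex` above replace a chained call on a temporary with a named result that is offset and then returned, so the returned object keeps both the coordinate offset and the clamped extent. A simplified sketch of why naming the derived object matters when the mutator is declared on the base type (hypothetical `Ref`/`View` types, assumptions only):

```cpp
#include <cassert>

struct Ref {
  int offset = 0;
  Ref &add_coord_offset(int delta) {   // mutator declared on the base type
    offset += delta;
    return *this;
  }
};

struct View : Ref {
  int extent = 0;
  explicit View(int extent_) : extent(extent_) {}
};

// Chaining the base-returning mutator on a temporary only yields a Ref, so the
// derived extent would not travel with the result. The patched subview()
// instead names the View, mutates it, and returns it intact:
View make_subview(int extent, int location) {
  View result(extent);
  result.add_coord_offset(location);
  return result;
}

int main() {
  View v = make_subview(8, 3);
  assert(v.extent == 8 && v.offset == 3);
  return 0;
}
```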
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/thread/matrix.h b/include/cutlass/thread/matrix.h index a54b347150..a7ffa6b5ab 100644 --- a/include/cutlass/thread/matrix.h +++ b/include/cutlass/thread/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/trace.h b/include/cutlass/trace.h index 39ffa2968c..62df598dd6 100644 --- a/include/cutlass/trace.h +++ b/include/cutlass/trace.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/pitch_linear_thread_map.h b/include/cutlass/transform/pitch_linear_thread_map.h index c19f79cbbc..11285014af 100644 --- a/include/cutlass/transform/pitch_linear_thread_map.h +++ b/include/cutlass/transform/pitch_linear_thread_map.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/thread/unaryOp.h b/include/cutlass/transform/thread/unaryOp.h index de4f79b972..6434db54f6 100644 --- a/include/cutlass/transform/thread/unaryOp.h +++ b/include/cutlass/transform/thread/unaryOp.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h index 7dce3228ec..a6bdca8f21 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h @@ -1,40 +1,39 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - *modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - *notice, this list of conditions and the following disclaimer in the - *documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its - *contributors may be used to endorse or promote products derived from this - *software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, - *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING - *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT,INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /*! \file \brief Templates calculating the address and predicates to the load of tiles - from pitch-linear rank=2 tensors. + from pitch-linear rank=2 tensors. - This iterator uses masks to guard out-of-bounds accesses and visits the last - "residue" tile first, with the objective of minimizing predicate mask updates - during steady-state operation. + This iterator uses masks to guard out-of-bounds accesses. The first tile this + iterator visits maybe partial, then the remaining tiles are complete. So, we + only need to compute the predicates twice, once before the first tile and + once for the remaining full tiles which can share the same predicates. 
A precomputed "Params" object minimizes the amount of state that must be - stored in registers, and integer addition is used to advance the pointer - through memory. + stored in registers, and integer addition is used to advance the pointer + through memory. */ #pragma once diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h index 97ab909c74..278766fd36 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h @@ -1,27 +1,25 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - *modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, - *this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - *notice, this list of conditions and the following disclaimer in the - *documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its - *contributors may be used to endorse or promote products derived from this - *software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, - *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING - *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT,INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /*! \file diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_iterator.h index 48d25ef42a..603d1fcb95 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_iterator.h +++ b/include/cutlass/transform/threadblock/predicated_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -25,8 +25,10 @@ /*! \file \brief Templates implementing loading of tiles from pitch-linear rank=2 tensors. - This iterator uses masks to guard out-of-bounds accesses and visits the last "residue" tile - first, with the objective of minimizing predicate mask updates during steady-state operation. + This iterator uses masks to guard out-of-bounds accesses. The first tile this + iterator visits maybe partial, then the remaining tiles are complete. So, we + only need to compute the predicates twice, once before the first tile and + once for the remaining full tiles which can share the same predicates. A precomputed "Params" object minimizes the amount of state that must be stored in registers, and integer addition is used to advance the pointer through memory. diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h index 0342a43464..9895e74c7a 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h index 0d775dffba..3541cd752e 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
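The updated iterator documentation above describes visiting the possibly partial first tile and then only complete tiles, so guard predicates need to be computed just twice. A simplified host sketch of that control flow (plain loops standing in for the tile iterator):

```cpp
#include <cassert>
#include <vector>

// The first tile covers the partial "residue" extent; every later tile is full,
// so a single set of always-true predicates is reused for the steady state.
int sum_with_residue_first(std::vector<int> const &data, int tile) {
  int total = int(data.size());
  int residue = total % tile;
  int first = (residue ? residue : tile);   // first tile may be partial

  int sum = 0;
  // Predicates for the (possibly partial) first tile.
  for (int i = 0; i < tile; ++i) {
    if (i < first) sum += data[i];          // guard needed only here
  }
  // Remaining tiles are complete; no per-element guard recomputation.
  for (int start = first; start < total; start += tile) {
    for (int i = 0; i < tile; ++i) sum += data[start + i];
  }
  return sum;
}

int main() {
  std::vector<int> data(10, 1);             // 10 elements, tile of 4: tiles of 2 + 4 + 4
  assert(sum_with_residue_first(data, 4) == 10);
  return 0;
}
```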
* * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h index 31f529e004..ec30b48ed8 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -182,7 +182,15 @@ class RegularTileAccessIterator< return prev; } - /// Adds a tile offset + /// Adds a tile offset in the unit of tile. + /// In GEMM/Conv implementation, this is used to move in the k dimension in the shared memory. + /// Below layouts are the shared memory layouts. Current SM50 SIMT kernels only use col major A and row major B. + /// For row major A operand, k dimension is contiguous dimension; + /// For col major A operand, k dimension is strided dimension; + /// For row major B operand, k dimension is strided dimension; + /// For col major B operand, k dimension is contiguous dimension. + /// Below two classes map col/row major to the pitch linear coordinates used + /// in this base class. CUTLASS_DEVICE void add_tile_offset(TensorCoord const &coord) { add_pointer_offset(coord.contiguous() * Shape::kContiguous + diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h index 32043130bd..e0c44b1c48 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h index 5a0c74fdc6..5861ca687c 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
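The `add_tile_offset` comment added above depends on how row-major and column-major matrix coordinates map onto pitch-linear `(contiguous, strided)` coordinates, which determines whether the GEMM k dimension advances along the contiguous or the strided axis. A small sketch of those two mappings (hypothetical helper functions):

```cpp
#include <cassert>
#include <utility>

// Pitch-linear coordinates are (contiguous, strided).
// Row-major:    (row, column) -> (contiguous = column, strided = row)
// Column-major: (row, column) -> (contiguous = row,    strided = column)
std::pair<int, int> row_major_to_pitch_linear(int row, int column) {
  return {column, row};
}

std::pair<int, int> column_major_to_pitch_linear(int row, int column) {
  return {row, column};
}

int main() {
  // A is M x K: for row-major A the k index (column) lands in the contiguous slot;
  // for column-major A it lands in the strided slot, matching the comment above.
  assert(row_major_to_pitch_linear(/*m=*/1, /*k=*/5).first == 5);
  assert(column_major_to_pitch_linear(/*m=*/1, /*k=*/5).second == 5);
  return 0;
}
```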
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator.h b/include/cutlass/transform/threadblock/regular_tile_iterator.h index d7928ac00a..e1978f361d 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h index 2dcd57d658..831131f0d7 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -227,14 +227,21 @@ class RegularTileIterator::value * (coord.contiguous() * Shape::kContiguous + coord.strided() * Shape::kStrided * stride_) / 8; add_pointer_offset(offset); } - }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h index 85d702fec6..abfba6b8b4 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h index c7f0690779..c35f131437 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h index 82c8842ec0..0d2bbeea4f 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/uint128.h b/include/cutlass/uint128.h new file mode 100644 index 0000000000..cfcb696e4d --- /dev/null +++ b/include/cutlass/uint128.h @@ -0,0 +1,253 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Defines an unsigned 128b integer with several operators to support 64-bit integer division. 
+*/ + +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#include +#include +#include +#endif + +#include "cutlass/cutlass.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Optionally enable GCC's built-in type +#if defined(__x86_64) && !defined(__CUDA_ARCH__) +#if defined(__GNUC__) +#define CUTLASS_UINT128_NATIVE +#elif defined(_MSC_VER) +#define CUTLASS_INT128_ARITHMETIC +#include +#endif +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///! Unsigned 128b integer type +struct uint128_t { + + /// Size of one part of the uint's storage in bits + int const kPartSize = sizeof(uint64_t) * 8; + + // Use a union to store either low and high parts or, if present, a built-in 128b integer type. + union { + + struct { + uint64_t lo; + uint64_t hi; + }; + + #if defined(CUTLASS_UINT128_NATIVE) + unsigned __int128 native; + #endif // defined(CUTLASS_UINT128_NATIVE) + }; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + uint128_t(): lo(0), hi(0) { } + + /// Constructor from uint64 + CUTLASS_HOST_DEVICE + uint128_t(uint64_t lo_): lo(lo_), hi(0) { } + + /// Constructor from two 64b unsigned integers + CUTLASS_HOST_DEVICE + uint128_t(uint64_t lo_, uint64_t hi_): lo(lo_), hi(hi_) { + + } + + /// Optional constructor from native value + #if defined(CUTLASS_UINT128_NATIVE) + uint128_t(unsigned __int128 value): native(value) { } + #endif + + /// Lossily cast to uint64 + CUTLASS_HOST_DEVICE + explicit operator uint64_t() const { + return lo; + } + + CUTLASS_HOST_DEVICE + static void exception() { +#if defined(__CUDA_ARCH__) + asm volatile (" brkpt;\n"); +#else + throw std::runtime_error("Not yet implemented."); +#endif + } + + /// Add + CUTLASS_HOST_DEVICE + uint128_t operator+(uint128_t const &rhs) const { + uint128_t y; +#if defined(CUTLASS_UINT128_NATIVE) + y.native = native + rhs.native; +#else + y.lo = lo + rhs.lo; + y.hi = hi + rhs.hi + (!y.lo && (rhs.lo)); +#endif + return y; + } + + /// Subtract + CUTLASS_HOST_DEVICE + uint128_t operator-(uint128_t const &rhs) const { + uint128_t y; +#if defined(CUTLASS_UINT128_NATIVE) + y.native = native - rhs.native; +#else + y.lo = lo - rhs.lo; + y.hi = hi - rhs.hi - (rhs.lo && y.lo > lo); +#endif + return y; + } + + /// Multiply by unsigned 64b integer yielding 128b integer + CUTLASS_HOST_DEVICE + uint128_t operator*(uint64_t const &rhs) const { + uint128_t y; +#if defined(CUTLASS_UINT128_NATIVE) + y.native = native * rhs; +#elif defined(CUTLASS_INT128_ARITHMETIC) + // Multiply by the low part + y.lo = _umul128(lo, rhs, &y.hi); + + // Add the high part and ignore the overflow + uint64_t overflow; + y.hi += _umul128(hi, rhs, &overflow); +#else + // TODO - not implemented + exception(); +#endif + return y; + } + + /// Divide 128b operation by 64b operation yielding a 64b quotient + CUTLASS_HOST_DEVICE + uint64_t operator/(uint64_t const &divisor) const { + uint64_t quotient = 0; +#if defined(CUTLASS_UINT128_NATIVE) + quotient = uint64_t(native / divisor); +#elif defined(CUTLASS_INT128_ARITHMETIC) + // implemented using MSVC's arithmetic intrinsics + uint64_t remainder = 0; + quotient = _udiv128(hi, lo, divisor, &remainder); +#else + // TODO - not implemented + exception(); +#endif + return quotient; + } + + /// Divide 128b operation by 64b operation yielding a 64b quotient + 
CUTLASS_HOST_DEVICE + uint64_t operator%(uint64_t const &divisor) const { + uint64_t remainder = 0; +#if defined(CUTLASS_UINT128_NATIVE) + remainder = uint64_t(native % divisor); +#elif defined(CUTLASS_INT128_ARITHMETIC) + // implemented using MSVC's arithmetic intrinsics + (void)_udiv128(hi, lo, divisor, &remainder); +#else + // TODO - not implemented + exception(); +#endif + return remainder; + } + + /// Computes the quotient and remainder in a single method. + CUTLASS_HOST_DEVICE + uint64_t divmod(uint64_t &remainder, uint64_t divisor) const { + uint64_t quotient = 0; +#if defined(CUTLASS_UINT128_NATIVE) + quotient = uint64_t(native / divisor); + remainder = uint64_t(native % divisor); +#elif defined(CUTLASS_INT128_ARITHMETIC) + // implemented using MSVC's arithmetic intrinsics + quotient = _udiv128(hi, lo, divisor, &remainder); +#else + // TODO - not implemented + exception(); +#endif + return quotient; + } + + /// Left-shifts a 128b unsigned integer + CUTLASS_HOST_DEVICE + uint128_t operator<<(int sh) const { + if (sh == 0) { + return *this; + } + else if (sh >= kPartSize) { + return uint128_t(0, lo << (sh - kPartSize)); + } + else { + return uint128_t( + (lo << sh), + (hi << sh) | uint64_t(lo >> (kPartSize - sh)) + ); + } + } + + /// Right-shifts a 128b unsigned integer + CUTLASS_HOST_DEVICE + uint128_t operator>>(int sh) const { + if (sh == 0) { + return *this; + } + else if (sh >= kPartSize) { + return uint128_t((hi >> (sh - kPartSize)), 0); + } + else { + return uint128_t( + (lo >> sh) | (hi << (kPartSize - sh)), + (hi >> sh) + ); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/wmma_array.h b/include/cutlass/wmma_array.h index e80961394d..37e87430a1 100644 --- a/include/cutlass/wmma_array.h +++ b/include/cutlass/wmma_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/media/docs/code_organization.md b/media/docs/code_organization.md index 9a00d3056f..55984d9bb1 100644 --- a/media/docs/code_organization.md +++ b/media/docs/code_organization.md @@ -220,7 +220,7 @@ of tests run may vary over time as more are added. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/doxygen_mainpage.md b/media/docs/doxygen_mainpage.md index 15656d25e5..6ff2575b9a 100644 --- a/media/docs/doxygen_mainpage.md +++ b/media/docs/doxygen_mainpage.md @@ -120,7 +120,7 @@ cudaError_t cutlass_sgemm_nn( # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
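The new `uint128_t` in `uint128.h` above keeps the value as `lo`/`hi` 64-bit halves, with a native `unsigned __int128` fast path when available. A standalone sketch of the portable half-word arithmetic (hypothetical `U128` type, not the CUTLASS struct), using the usual `sum < operand` carry test and the same three shift cases:

```cpp
#include <cassert>
#include <cstdint>

// 128-bit value split into two 64-bit halves.
struct U128 { uint64_t lo, hi; };

// Addition: detect carry out of the low half with the standard wraparound test.
U128 add(U128 a, U128 b) {
  U128 y;
  y.lo = a.lo + b.lo;
  y.hi = a.hi + b.hi + (y.lo < a.lo ? 1u : 0u);
  return y;
}

// Left shift by 0..127 bits, matching the three cases in operator<< above.
U128 shl(U128 a, int sh) {
  if (sh == 0) return a;
  if (sh >= 64) return U128{0, a.lo << (sh - 64)};
  return U128{a.lo << sh, (a.hi << sh) | (a.lo >> (64 - sh))};
}

int main() {
  U128 a{~uint64_t(0), 0};            // 2^64 - 1
  U128 b{1, 0};
  U128 s = add(a, b);                 // 2^64
  assert(s.lo == 0 && s.hi == 1);

  U128 t = shl(U128{1, 0}, 100);      // 1 << 100
  assert(t.lo == 0 && t.hi == (uint64_t(1) << 36));
  return 0;
}
```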
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/efficient_gemm.md b/media/docs/efficient_gemm.md index 7a1a6ae7f4..a8374fd8a2 100644 --- a/media/docs/efficient_gemm.md +++ b/media/docs/efficient_gemm.md @@ -225,7 +225,7 @@ targeting NVIDIA GPUs. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/functionality.md b/media/docs/functionality.md index aeb9bcf3b6..c5570750e3 100644 --- a/media/docs/functionality.md +++ b/media/docs/functionality.md @@ -249,7 +249,7 @@ CUDA exposes warp-level matrix operations in the CUDA C++ WMMA API. The CUDA C++ # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/fundamental_types.md b/media/docs/fundamental_types.md index 7556cd45dc..40f5f0810f 100644 --- a/media/docs/fundamental_types.md +++ b/media/docs/fundamental_types.md @@ -346,7 +346,7 @@ support on current and future NVIDIA GPUs. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/gemm_api.md b/media/docs/gemm_api.md index fec32a0451..2c268fdc7b 100644 --- a/media/docs/gemm_api.md +++ b/media/docs/gemm_api.md @@ -541,7 +541,7 @@ to inline PTX. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/implicit_gemm_convolution.md b/media/docs/implicit_gemm_convolution.md index 5cc0a258e5..c564eb61e4 100644 --- a/media/docs/implicit_gemm_convolution.md +++ b/media/docs/implicit_gemm_convolution.md @@ -754,7 +754,7 @@ Convolution can also be run by the CUTLASS Profiler. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/layout.md b/media/docs/layout.md index bacec0e442..0de2751211 100644 --- a/media/docs/layout.md +++ b/media/docs/layout.md @@ -267,7 +267,7 @@ Permuted Shared Memory Layouts: # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/profiler.md b/media/docs/profiler.md index c7ce91a7ca..4d04f40b42 100644 --- a/media/docs/profiler.md +++ b/media/docs/profiler.md @@ -501,7 +501,7 @@ reference_device: Passed # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/programming_guidelines.md b/media/docs/programming_guidelines.md index 0cf7ea257f..e87d93ba66 100644 --- a/media/docs/programming_guidelines.md +++ b/media/docs/programming_guidelines.md @@ -292,7 +292,7 @@ Github's pretty printer. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md index f283da8a3b..333ea07346 100644 --- a/media/docs/quickstart.md +++ b/media/docs/quickstart.md @@ -537,7 +537,7 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=tensorop*s* # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/terminology.md b/media/docs/terminology.md index 07464143cb..e41a655569 100644 --- a/media/docs/terminology.md +++ b/media/docs/terminology.md @@ -74,7 +74,7 @@ contiguous and strided dimensions of a tile. # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/tile_iterator_concept.md b/media/docs/tile_iterator_concept.md index 061ff90734..c4a3962b54 100644 --- a/media/docs/tile_iterator_concept.md +++ b/media/docs/tile_iterator_concept.md @@ -466,7 +466,7 @@ struct WriteableReadableRandomAccessContiguousTileIteratorConcept { # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/utilities.md b/media/docs/utilities.md index b9ddc79a70..fc4ac8ca1b 100644 --- a/media/docs/utilities.md +++ b/media/docs/utilities.md @@ -379,7 +379,7 @@ int main() { # Copyright -Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/images/13_example_block_resident_fusion.png b/media/images/13_example_block_resident_fusion.png new file mode 100755 index 0000000000..736857b946 Binary files /dev/null and b/media/images/13_example_block_resident_fusion.png differ diff --git a/media/images/13_example_fusion.png b/media/images/13_example_fusion.png new file mode 100755 index 0000000000..142c8d04a7 Binary files /dev/null and b/media/images/13_example_fusion.png differ diff --git a/media/images/13_example_rf_resident_fusion.png b/media/images/13_example_rf_resident_fusion.png new file mode 100755 index 0000000000..dc2786f756 Binary files /dev/null and b/media/images/13_example_rf_resident_fusion.png differ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 436990fd66..55542b5367 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. 
All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index d57570ce6c..7954c53e3e 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/common/cutlass_unit_test.h b/test/unit/common/cutlass_unit_test.h index 81908265fa..83e930da82 100644 --- a/test/unit/common/cutlass_unit_test.h +++ b/test/unit/common/cutlass_unit_test.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -59,3 +59,7 @@ void FilterArchitecture(); #define CUTLASS_TEST_L1(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(1,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__) #define CUTLASS_TEST_L2(NAME_STATIC,NAME_DYNAMIC,...) CUTLASS_TEST_LEVEL_ACTIVE(2,NAME_STATIC,NAME_DYNAMIC,__VA_ARGS__) #endif + +#if !defined(CUTLASS_TEST_UNIT_ENABLE_WARNINGS) +#define CUTLASS_TEST_UNIT_ENABLE_WARNINGS false +#endif diff --git a/test/unit/common/filter_architecture.cpp b/test/unit/common/filter_architecture.cpp index 0c548bdf86..5e13354b6e 100644 --- a/test/unit/common/filter_architecture.cpp +++ b/test/unit/common/filter_architecture.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/CMakeLists.txt b/test/unit/conv/CMakeLists.txt index a50a58f59e..c2840838f2 100644 --- a/test/unit/conv/CMakeLists.txt +++ b/test/unit/conv/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/conv/device/CMakeLists.txt b/test/unit/conv/device/CMakeLists.txt index ce907e0d58..1578625686 100644 --- a/test/unit/conv/device/CMakeLists.txt +++ b/test/unit/conv/device/CMakeLists.txt @@ -20,32 +20,73 @@ # STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- add_custom_target( +list(SORT CUTLASS_NVCC_ARCHS_ENABLED) +set(CUTLASS_NVCC_ARCHS_ENABLED_REVERSED ${CUTLASS_NVCC_ARCHS_ENABLED}) +list(REVERSE CUTLASS_NVCC_ARCHS_ENABLED_REVERSED) +list(GET CUTLASS_NVCC_ARCHS_ENABLED_REVERSED 0 CUTLASS_NVCC_MAX_ARCH) + +add_custom_target( cutlass_test_unit_conv_device DEPENDS cutlass_test_unit_conv_device_simt - cutlass_test_unit_conv_device_tensorop_f32_sm70 - cutlass_test_unit_conv_device_tensorop_f32_sm75 - cutlass_test_unit_conv_device_tensorop_f16_sm80 - cutlass_test_unit_conv_device_tensorop_f32_sm80 - cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 - cutlass_test_unit_conv_device_tensorop_s32 - cutlass_test_unit_conv_device_tensorop_s32_interleaved ) add_custom_target( test_unit_conv_device DEPENDS test_unit_conv_device_simt - test_unit_conv_device_tensorop_f32_sm70 - test_unit_conv_device_tensorop_f32_sm75 - test_unit_conv_device_tensorop_f16_sm80 - test_unit_conv_device_tensorop_f32_sm80 - test_unit_conv_device_tensorop_f32_tf32_sm80 - test_unit_conv_device_tensorop_s32 - test_unit_conv_device_tensorop_s32_interleaved ) +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 70) + + add_dependencies( + cutlass_test_unit_conv_device + cutlass_test_unit_conv_device_tensorop_f32_sm70 + ) + + add_dependencies( + test_unit_conv_device + test_unit_conv_device_tensorop_f32_sm70 + ) + +endif() + +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 75) + + add_dependencies( + cutlass_test_unit_conv_device + cutlass_test_unit_conv_device_tensorop_f32_sm75 + cutlass_test_unit_conv_device_tensorop_s32 + cutlass_test_unit_conv_device_tensorop_s32_interleaved + ) + + add_dependencies( + test_unit_conv_device + test_unit_conv_device_tensorop_f32_sm75 + test_unit_conv_device_tensorop_s32 + test_unit_conv_device_tensorop_s32_interleaved + ) + +endif() + +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80) + + add_dependencies( + cutlass_test_unit_conv_device + cutlass_test_unit_conv_device_tensorop_f16_sm80 + cutlass_test_unit_conv_device_tensorop_f32_sm80 + cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 + ) + + add_dependencies( + test_unit_conv_device + test_unit_conv_device_tensorop_f16_sm80 + test_unit_conv_device_tensorop_f32_sm80 + test_unit_conv_device_tensorop_f32_tf32_sm80 + ) + +endif() + # # OpClassSimt (CUDA cores) # @@ -56,20 +97,27 @@ cutlass_test_unit_add_executable( # F32 conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu - conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu - conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu - conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu - # CF32 conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu - - conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu - conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu - conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu ) +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80) + + cutlass_target_sources( + cutlass_test_unit_conv_device_simt + PRIVATE + conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu + conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + 
conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu + ) + +endif() + # # OpClassTensorOp (Tensor cores) # @@ -92,57 +140,81 @@ cutlass_test_unit_add_executable( conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu ) -# Conv2d - F16 input, F16 output, F16 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_f16_sm80 - - conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu - conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu - conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu -) - -# Conv2d - F16 input, F32 output, F32 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_f32_sm80 - - - conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu - conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu - conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu - - conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu - conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu -) - -# Conv2d - TF32 input, F32 output, F32 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 - - conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu - conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu - conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80) + + # Conv2d - F16 input, F16 output, F16 accumulation + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f16_sm80 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu + ) - conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu - conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu - conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu -) + # Conv2d - F16 input, F32 output, F32 accumulation + + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_sm80 + + conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu + + conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu + conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + ) + + # Conv2d - TF32 input, F32 output, F32 accumulation + + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_f32_tf32_sm80 + + conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu + + conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu + ) -# Conv2d - S8 input, S32 output, S32 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_s32 +endif() - 
conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu - conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu - conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu - conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu -) +if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 75) -# Conv2d - S8 interleaved input, S8 interleaved output, S32 accumulation -cutlass_test_unit_add_executable( - cutlass_test_unit_conv_device_tensorop_s32_interleaved + # Conv2d - S8 input, S32 output, S32 accumulation - conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu - conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu - conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu - conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu -) + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_s32 + conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu + ) + + # Conv2d - S8 interleaved input, S8 interleaved output, S32 accumulation + + cutlass_test_unit_add_executable( + cutlass_test_unit_conv_device_tensorop_s32_interleaved + conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu + conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu + ) + + if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80) + + cutlass_target_sources( + cutlass_test_unit_conv_device_tensorop_s32 + PRIVATE + conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu + conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu + ) + + # Conv2d - S8 interleaved input, S8 interleaved output, S32 accumulation + cutlass_target_sources( + cutlass_test_unit_conv_device_tensorop_s32_interleaved + PRIVATE + conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu + conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu + ) + + endif() + +endif() diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu index 4d500d9783..ba53d6f727 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu index cc36edc75e..dc3f9d5062 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
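The test/unit/conv/device/CMakeLists.txt hunks above replace the flat dependency list with architecture-gated wiring: the highest enabled compute capability is derived from CUTLASS_NVCC_ARCHS_ENABLED, and each SM70/SM75/SM80 test target is only created and attached to the umbrella targets when the build actually enables that architecture. The sketch below condenses that pattern using a hypothetical target and source name; it is illustrative only.

```cmake
# Derive the highest enabled architecture (lexicographic sort is sufficient for
# the two-digit SM values used here).
list(SORT CUTLASS_NVCC_ARCHS_ENABLED)
set(CUTLASS_NVCC_ARCHS_ENABLED_REVERSED ${CUTLASS_NVCC_ARCHS_ENABLED})
list(REVERSE CUTLASS_NVCC_ARCHS_ENABLED_REVERSED)
list(GET CUTLASS_NVCC_ARCHS_ENABLED_REVERSED 0 CUTLASS_NVCC_MAX_ARCH)

# Only build and wire in SM80-specific tests when SM80 is actually enabled.
if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 80)
  cutlass_test_unit_add_executable(
    cutlass_test_unit_conv_device_example_sm80   # hypothetical target name
    example_conv_kernel_sm80.cu                  # hypothetical source file
  )
  add_dependencies(
    cutlass_test_unit_conv_device
    cutlass_test_unit_conv_device_example_sm80
  )
endif()
```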
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu index aab0d34e49..e3eb0736d8 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu index bc9ee6e9d7..ff512c02b2 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu index 7417f92197..212290cb8b 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu index 01f51a2cc4..b1fc52f4d6 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu index 7682a319fe..542e1e6b96 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu index 48c6ddb043..262e221e9f 100644 --- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu index b3b66a9de1..9ef2c7f640 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu index 25e3ee0d5f..baece322a8 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu index e151f5a78f..3366f1b541 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu index 4c8102a503..72026a7e8c 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu index 15f5585839..5332bea683 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu index b54359f177..7b74e1284b 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu index 51d2b942f4..46c366b76a 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu index 820f0fb89f..78885e2a4e 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu index 746e7d7b0b..3637fe8c9f 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu index 7255eac644..b9fa943f07 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu index 7e9bb9060b..a343aed586 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu index 5426003779..38f6c6fbe7 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu index d0ba7a5047..0bdd99bd0c 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu index fbab373165..5893564830 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu index e8b7c44fe2..021dc9a39f 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu index e5146be328..43f6c0965b 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu index 4cfdd3722d..2446c4aa99 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu index c1a1f647a3..4c7b3d77df 100644 --- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_problems.h b/test/unit/conv/device/conv2d_problems.h index 74b43e11c7..c532894e9b 100644 --- a/test/unit/conv/device/conv2d_problems.h +++ b/test/unit/conv/device/conv2d_problems.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -165,6 +165,22 @@ struct TestbedConv2dProblemSizes { // C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64} //////////////////////////////////////////////////////////////////////////////////////////// + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 1, 1, minimum_channel_size}, // input size (NHWC) + {8, 1, 1, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( + {1, 1, 8, minimum_channel_size}, // input size (NHWC) + {8, 1, 3, minimum_channel_size}, // filter size (KRSC) + {1, 1, 1, 1}, // padding (pad_h, _, pad_w, _) + {1, 1}, // stride (stride_h, stride_w) + {1, 1} // dilation (dilation_h, dilation_w) + )); + conv2d_default_sizes.push_back(cutlass::conv::Conv2dProblemSize( {1, 8, 8, minimum_channel_size}, // input size (NHWC) {8, 3, 3, minimum_channel_size}, // filter size (KRSC) @@ -322,7 +338,7 @@ struct TestbedConv2dProblemSizes { {1, 1}, // dilation (dilation_h, dilation_w) {4, 1, 1, 328} // output size (NPQK) )); - + } diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h index 14bdd9bf13..9b94a4db61 100644 --- a/test/unit/conv/device/conv2d_testbed.h +++ b/test/unit/conv/device/conv2d_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -204,10 +204,13 @@ class TestbedConv2d { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive test if CUDA device is insufficient - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } #if 0 //display conv2d problem size for debugging std::cout << problem_size << std::endl diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h index cb4ecc7056..06ab207d14 100644 --- a/test/unit/conv/device/conv2d_testbed_interleaved.h +++ b/test/unit/conv/device/conv2d_testbed_interleaved.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
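The conv2d_problems.h hunk above adds two very small problem sizes (1x1 and 1x8 spatial extents) to stress the C < CTA::K paths, and conv2d_testbed.h now reports waived tests only when CUTLASS_TEST_UNIT_ENABLE_WARNINGS is set. As a standalone illustration of the constructor-argument order those new entries rely on, the sketch below builds one such problem size; the header path is assumed, `minimum_channel_size` is the testbed's existing knob, and the value 32 is only illustrative.

```cpp
#include <vector>

#include "cutlass/conv/conv2d_problem_size.h"   // assumed header for Conv2dProblemSize

void append_small_channel_problem(std::vector<cutlass::conv::Conv2dProblemSize> &sizes) {
  int const minimum_channel_size = 32;   // illustrative; the testbed supplies its own value

  sizes.push_back(cutlass::conv::Conv2dProblemSize(
    {1, 1, 1, minimum_channel_size},   // input size  (NHWC)
    {8, 1, 1, minimum_channel_size},   // filter size (KRSC)
    {1, 1, 1, 1},                      // padding     (pad_h, _, pad_w, _)
    {1, 1},                            // stride      (stride_h, stride_w)
    {1, 1}                             // dilation    (dilation_h, dilation_w)
  ));
}
```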
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu index 07961dd2b7..dbc5533225 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu index a68a30fe5b..6cf9b15fb7 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu index 3cbde02888..a27265143d 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu index ffb79d77ad..bd49794eab 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu index 1101090a12..14b93a3621 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu index ade6f8df32..ca74be4d7e 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu index a0aac81147..5645c90de7 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu index 2185257f15..d67d54290c 100644 --- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..e57d61170c --- /dev/null +++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,120 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_dgrad.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +TEST(SM80_Device_Conv3d_Dgrad_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// + + +TEST(SM80_Device_Conv3d_Dgrad_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED + diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu index 211a331d8b..dfb64ce363 100644 --- a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -1,5 
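The new conv3d dgrad test file above instantiates DefaultConv3dDgrad for SM80 in both the Analytic and Optimized iterator-algorithm variants. The sketch below restates that instantiation pattern with the template arguments written out in full (the epilogue vector width parameterized on ElementC, and ImplicitGemmConvolution and TestAllConv3d parameterized on the kernel and device types); treat those exact arguments as an editorial reconstruction, not a quotation of the committed code.

```cpp
#include "../../common/cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv3d_dgrad.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "conv3d_testbed.h"

#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = float;
using ElementAccumulator = float;
using ElementCompute = float;

using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad<
    ElementA, cutlass::layout::TensorNDHWC,
    ElementB, cutlass::layout::TensorNDHWC,
    ElementC, cutlass::layout::TensorNDHWC,
    ElementAccumulator,
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80,
    cutlass::gemm::GemmShape<128, 128, 32>,            // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,              // warp tile
    cutlass::gemm::GemmShape<16, 8, 16>,               // tensor core instruction shape
    cutlass::epilogue::thread::LinearCombination<
        ElementC,
        128 / cutlass::sizeof_bits<ElementC>::value,   // vector width of epilogue accesses
        ElementAccumulator,
        ElementCompute>,
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
    4,                                                 // pipeline stages
    cutlass::arch::OpMultiplyAdd>::Kernel;

using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution<Conv3dDgradKernel>;

TEST(SM80_Device_Conv3d_Dgrad_Example_Reconstruction, 128x128_32x4_64x64x32) {
  // Run the testbed's full set of 3-D problem sizes against this instance.
  EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dDgrad>());
}

#endif  // CUTLASS_ARCH_MMA_SM80_SUPPORTED
```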
+1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -76,5 +76,46 @@ TEST(SM80_Device_Conv3d_Dgrad_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc EXPECT_TRUE(test::conv::device::TestAllConv3d()); } +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Dgrad_Optimized_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv2d instance + using Conv3dDgradKernel = typename cutlass::conv::kernel::DefaultConv3dDgrad< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized, + cutlass::conv::StrideSupport::kUnity + >::Kernel; + + using Conv3dDgrad = cutlass::conv::device::ImplicitGemmConvolution; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d()); +} //////////////////////////////////////////////////////////////////////////////// #endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu new file mode 100644 index 0000000000..d5abb46e72 --- /dev/null +++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu @@ -0,0 +1,80 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_Conv3d_Fprop_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv3d instance + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM75_SUPPORTED diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu new file mode 100644 index 0000000000..b89485e2cb --- /dev/null +++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -0,0 +1,159 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide Implicit GEMM interface +*/ + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/device/implicit_gemm_convolution.h" + +#include "conv3d_testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +TEST(SM80_Device_Conv3d_Fprop_Analytic_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// + + +TEST(SM80_Device_Conv3d_Fprop_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, +
cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Fprop_Optimized_ImplicitGemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, + 64x256_32x4_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::half_t; + using ElementB = cutlass::half_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + +//////////////////////////////////////////////////////////////////////////////// +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu index 0aabef5ba6..9a5c21eafc 100644 --- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
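The new test files above only drive these operators through TestAllConv3d(), which hides the host-side setup. For reference, the following is a minimal sketch (not part of the patch) of how the SM80 f16 NDHWC Fprop operator defined above could be instantiated and run on a single problem size. The kernel typedef and epilogue mirror the test; the Conv3dProblemSize constructor mirrors conv3d_problems.h; the extent helpers (implicit_gemm_tensor_*_extent), the can_implement/initialize/operator() sequence, and the assumption that no workspace is needed for a single split-K slice follow the device-level ImplicitGemmConvolution interface as the conv3d testbed uses it, and should be read as assumptions rather than as part of this change.

```
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv3d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/conv/conv3d_problem_size.h"
#include "cutlass/util/host_tensor.h"

// Same configuration as the SM80 f16 NDHWC Fprop test above.
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = float;
using ElementAccumulator = float;
using ElementCompute = float;

using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop<
  ElementA, cutlass::layout::TensorNDHWC,
  ElementB, cutlass::layout::TensorNDHWC,
  ElementC, cutlass::layout::TensorNDHWC,
  ElementAccumulator,
  cutlass::arch::OpClassTensorOp,
  cutlass::arch::Sm80,
  cutlass::gemm::GemmShape<128, 128, 32>,
  cutlass::gemm::GemmShape<64, 64, 32>,
  cutlass::gemm::GemmShape<16, 8, 16>,
  cutlass::epilogue::thread::LinearCombination<
    ElementC,
    128 / cutlass::sizeof_bits<ElementC>::value,
    ElementAccumulator,
    ElementCompute
  >,
  cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
  4,
  cutlass::arch::OpMultiplyAdd
>::Kernel;

using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>;

cutlass::Status run_single_fprop() {

  // One NDHWC problem, mirroring the sizes added to conv3d_problems.h:
  // input (N, D, H, W, C), filter (K, T, R, S, C), unit padding/stride/dilation.
  cutlass::conv::Conv3dProblemSize problem_size(
    {1, 8, 8, 8, 64},               // input size  (NDHWC)
    {32, 3, 3, 3, 64},              // filter size (KTRSC)
    cutlass::Coord<3>({1, 1, 1}),   // padding
    cutlass::Coord<3>({1, 1, 1}),   // stride
    cutlass::Coord<3>({1, 1, 1}));  // dilation

  // Tensors sized by the extent helpers the conv3d testbed uses (assumed names).
  cutlass::HostTensor<ElementA, cutlass::layout::TensorNDHWC> tensor_a(
    cutlass::conv::implicit_gemm_tensor_a_extent(cutlass::conv::Operator::kFprop, problem_size));
  cutlass::HostTensor<ElementB, cutlass::layout::TensorNDHWC> tensor_b(
    cutlass::conv::implicit_gemm_tensor_b_extent(cutlass::conv::Operator::kFprop, problem_size));
  cutlass::HostTensor<ElementC, cutlass::layout::TensorNDHWC> tensor_c(
    cutlass::conv::implicit_gemm_tensor_c_extent(cutlass::conv::Operator::kFprop, problem_size));
  cutlass::HostTensor<ElementC, cutlass::layout::TensorNDHWC> tensor_d(
    cutlass::conv::implicit_gemm_tensor_c_extent(cutlass::conv::Operator::kFprop, problem_size));

  // D = alpha * conv3d(A, B) + beta * C
  typename Conv3dFprop::Arguments arguments{
    problem_size,
    tensor_a.device_ref(),
    tensor_b.device_ref(),
    tensor_c.device_ref(),
    tensor_d.device_ref(),
    {ElementCompute(1), ElementCompute(0)}
  };

  Conv3dFprop conv_op;

  cutlass::Status status = conv_op.can_implement(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  // Assumes no workspace is required for the default serial split-K with one slice.
  status = conv_op.initialize(arguments);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  return conv_op();   // launches the implicit GEMM convolution kernel
}
```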
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -76,5 +76,46 @@ TEST(SM80_Device_Conv3d_Fprop_Analytic_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc EXPECT_TRUE(test::conv::device::TestAllConv3d()); } +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Conv3d_Fprop_Optimized_ImplicitGemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, + 128x128_32x3_64x64x32) { + + /// Conv operation element types for the Gemm equivalent (ImplicitGemm) + using ElementA = cutlass::tfloat32_t; + using ElementB = cutlass::tfloat32_t; + using ElementC = float; + using ElementAccumulator = float; + using ElementCompute = float; + + /// Device-level Conv3d instance + using Conv3dFpropKernel = typename cutlass::conv::kernel::DefaultConv3dFprop< + ElementA, cutlass::layout::TensorNDHWC, + ElementB, cutlass::layout::TensorNDHWC, + ElementC, cutlass::layout::TensorNDHWC, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementC, + 128 / cutlass::sizeof_bits<ElementC>::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd, + cutlass::conv::IteratorAlgorithm::kOptimized + >::Kernel; + + using Conv3dFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv3dFpropKernel>; + + /// Run all unit test sizes with device-level Conv3d instance + EXPECT_TRUE(test::conv::device::TestAllConv3d<Conv3dFprop>()); +} + //////////////////////////////////////////////////////////////////////////////// #endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED diff --git a/test/unit/conv/device/conv3d_problems.h b/test/unit/conv/device/conv3d_problems.h index 9cc618467e..21dc4b4f07 100644 --- a/test/unit/conv/device/conv3d_problems.h +++ b/test/unit/conv/device/conv3d_problems.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -107,9 +107,25 @@ struct TestbedConv3dProblemSizes { )); conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( - {1, 1, 16, 16, minimum_channel_size}, // input size (NDHWC) - {8, 1, 3, 3, minimum_channel_size}, // filter size (KTRSC) - cutlass::Coord<3>({0, 1, 1}), // padding (pad_d, pad_h, pad_w) + {1, 1, 1, 8, minimum_channel_size}, // input size (NDHWC) + {8, 1, 1, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 8, 8, 8, minimum_channel_size}, // input size (NDHWC) + {8, 3, 3, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) + cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) + cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) + )); + + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( + {1, 16, 16, 16, minimum_channel_size}, // input size (NDHWC) + {8, 3, 3, 3, minimum_channel_size}, // filter size (KTRSC) + cutlass::Coord<3>({1, 1, 1}), // padding (pad_d, pad_h, pad_w) cutlass::Coord<3>({1, 1, 1}), // stride (stride_d, stride_h, stride_w) cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) )); @@ -138,6 +154,7 @@ struct TestbedConv3dProblemSizes { cutlass::Coord<3>({1, 1, 1}) // dilation (dilation_d, dilation_h, dilation_w) )); + conv3d_default_sizes.push_back(cutlass::conv::Conv3dProblemSize( {1, 11, 15, 19, 64}, // input size (NDHWC) {32, 4, 3, 6, 64}, // filter size (KTRSC) diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h index 179520d158..87ac39abb7 100644 --- a/test/unit/conv/device/conv3d_testbed.h +++ b/test/unit/conv/device/conv3d_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -204,10 +204,14 @@ class TestbedConv3d { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute()) { - // Waive test if CUDA device is insufficient. - if (!sufficient()) { - return true; - } + + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." 
<< std::endl; + return true; + } #if 0 //display conv3d problem size for debugging std::cout << problem_size << std::endl @@ -413,11 +417,6 @@ bool TestAllConv3d( // TestbedConv3dProblemSizes conv3d_problems(128/cutlass::sizeof_bits::value); - // - // Get conv problem sizes to run conv operator - // - //TestbedConv3dProblemSizes conv_problems(128/cutlass::sizeof_bits::value); - // Vector of conv3d problem sizes to avoid duplicate runs Conv3dProblemVector conv_tested_sizes; @@ -443,12 +442,17 @@ // Procedurally disable certain cases // - // CUTLASS DGRAD's unity stride specialization only support stride {1, 1} if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad) && - (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == - cutlass::conv::StrideSupport::kUnity)) { - if (!((conv_problem.stride_h == 1) && (conv_problem.stride_w == 1))) { + // CUTLASS DGRAD's unity stride specialization only supports stride {1, 1, 1} if ((ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad) && + ((ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport == + cutlass::conv::StrideSupport::kUnity) || + (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorB::kStrideSupport == + cutlass::conv::StrideSupport::kUnity))) { + if (!((conv_problem.stride_d == 1) && + (conv_problem.stride_h == 1) && + (conv_problem.stride_w == 1)) + ) { continue; } } diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu index a3f8409447..e706f1dae7 100644 --- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu index 9847aede81..89167ce384 100644 --- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
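The stride guard added to conv3d_testbed.h above can be read in isolation: dgrad kernels whose A or B tile iterator is specialized for unit stride are only handed problems with stride {1, 1, 1}. Below is a small sketch of that predicate as a standalone helper, assuming only the traits and fields that appear in the diff (kConvolutionalOperator, Mma::IteratorA/B::kStrideSupport, and the Conv3dProblemSize stride members); the helper name and header choices are illustrative.

```
#include "cutlass/conv/convolution.h"
#include "cutlass/conv/conv3d_problem_size.h"

// Mirrors the procedural disable in conv3d_testbed.h: unity-stride DGRAD
// specializations are skipped unless the problem uses stride {1, 1, 1}.
template <typename ImplicitGemm>
bool conv3d_stride_supported(cutlass::conv::Conv3dProblemSize const &problem) {

  bool unity_stride_only =
    (ImplicitGemm::kConvolutionalOperator == cutlass::conv::Operator::kDgrad) &&
    ((ImplicitGemm::ImplicitGemmKernel::Mma::IteratorA::kStrideSupport ==
        cutlass::conv::StrideSupport::kUnity) ||
     (ImplicitGemm::ImplicitGemmKernel::Mma::IteratorB::kStrideSupport ==
        cutlass::conv::StrideSupport::kUnity));

  if (unity_stride_only) {
    return (problem.stride_d == 1) && (problem.stride_h == 1) && (problem.stride_w == 1);
  }

  return true;
}
```

A caller looping over TestbedConv3dProblemSizes could use such a predicate to skip, rather than fail, problems that the unity-stride dgrad specializations cannot run, which is exactly what the testbed's `continue` does.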
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu index 6dcbf0e726..477762fd1d 100644 --- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu +++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/CMakeLists.txt b/test/unit/core/CMakeLists.txt index 19282035f5..ade17ae0bd 100644 --- a/test/unit/core/CMakeLists.txt +++ b/test/unit/core/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/core/array.cu b/test/unit/core/array.cu index 5a8cc855b0..bafbfbc298 100644 --- a/test/unit/core/array.cu +++ b/test/unit/core/array.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/bfloat16.cu b/test/unit/core/bfloat16.cu index d33ff2cc3c..29262fadc6 100644 --- a/test/unit/core/bfloat16.cu +++ b/test/unit/core/bfloat16.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/complex.cu b/test/unit/core/complex.cu index 003762f719..59812d6e38 100644 --- a/test/unit/core/complex.cu +++ b/test/unit/core/complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/functional.cu b/test/unit/core/functional.cu index ab843154ef..a3b98f7037 100644 --- a/test/unit/core/functional.cu +++ b/test/unit/core/functional.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/half.cu b/test/unit/core/half.cu index dad1f97a79..a888741f5e 100644 --- a/test/unit/core/half.cu +++ b/test/unit/core/half.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/matrix.cu b/test/unit/core/matrix.cu index f012fe9f87..f94605d7b5 100644 --- a/test/unit/core/matrix.cu +++ b/test/unit/core/matrix.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/matrix_coord.cu b/test/unit/core/matrix_coord.cu index 841d4cb72a..69d4f0977e 100644 --- a/test/unit/core/matrix_coord.cu +++ b/test/unit/core/matrix_coord.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/numeric_conversion.cu b/test/unit/core/numeric_conversion.cu index 5f8f383987..8fc3128d87 100644 --- a/test/unit/core/numeric_conversion.cu +++ b/test/unit/core/numeric_conversion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/predicate_vector.cu b/test/unit/core/predicate_vector.cu index f9a0675c01..3dbe835242 100644 --- a/test/unit/core/predicate_vector.cu +++ b/test/unit/core/predicate_vector.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/quaternion.cu b/test/unit/core/quaternion.cu index 69ce928aec..62f8118834 100644 --- a/test/unit/core/quaternion.cu +++ b/test/unit/core/quaternion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tensor_ref.cu b/test/unit/core/tensor_ref.cu index 6bedddc577..f30cc19476 100644 --- a/test/unit/core/tensor_ref.cu +++ b/test/unit/core/tensor_ref.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tensor_view.cu b/test/unit/core/tensor_view.cu index 684ca5b0f2..6ea8d2f313 100644 --- a/test/unit/core/tensor_view.cu +++ b/test/unit/core/tensor_view.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/test_unit_core.cpp b/test/unit/core/test_unit_core.cpp index a6dfbf4bbc..bcebec675f 100644 --- a/test/unit/core/test_unit_core.cpp +++ b/test/unit/core/test_unit_core.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tfloat32.cu b/test/unit/core/tfloat32.cu index 9b54603fee..96e4c91389 100644 --- a/test/unit/core/tfloat32.cu +++ b/test/unit/core/tfloat32.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/CMakeLists.txt b/test/unit/epilogue/CMakeLists.txt index 9de2d56edb..66050010d5 100755 --- a/test/unit/epilogue/CMakeLists.txt +++ b/test/unit/epilogue/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/CMakeLists.txt b/test/unit/epilogue/thread/CMakeLists.txt index 9b04f7752a..dd43262d1b 100644 --- a/test/unit/epilogue/thread/CMakeLists.txt +++ b/test/unit/epilogue/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/linear_combination.cu b/test/unit/epilogue/thread/linear_combination.cu index 6518e98738..5ff188a3e8 100644 --- a/test/unit/epilogue/thread/linear_combination.cu +++ b/test/unit/epilogue/thread/linear_combination.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/linear_combination_planar_complex.cu b/test/unit/epilogue/thread/linear_combination_planar_complex.cu index 89d1be5e02..e6327a1dee 100644 --- a/test/unit/epilogue/thread/linear_combination_planar_complex.cu +++ b/test/unit/epilogue/thread/linear_combination_planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/CMakeLists.txt b/test/unit/epilogue/threadblock/CMakeLists.txt index cb8b7a62d5..b987a05cb1 100755 --- a/test/unit/epilogue/threadblock/CMakeLists.txt +++ b/test/unit/epilogue/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -22,6 +22,7 @@ cutlass_test_unit_add_executable( cutlass_test_unit_epilogue_threadblock + predicated_tile_iterator.cu output_tile_threadmap.cu epilogue_simt.cu diff --git a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu index 76b70f5069..11fa80cf8a 100644 --- a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu +++ b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt.cu b/test/unit/epilogue/threadblock/epilogue_simt.cu index 935a812426..72b86cfac1 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu index 25cd8933c5..c6ff649dc8 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu index f3552a1847..0cdadad0c1 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu index db8e68a3a5..d1a2b9d9a5 100644 --- a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -460,7 +460,7 @@ TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_128x32_64x32x32) { } -TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_256x128_64x64x32) { +TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_256x128_64x64x32) { // // Define the warp-level matrix multiply @@ -520,7 +520,7 @@ TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_256x128_64x64x32) { } -TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_128x256_64x64x32) { +TEST(SM75_Epilogue_threadblock_epilogue, s4_tensor_op_128x256_64x64x32) { // // Define the warp-level matrix multiply diff --git a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu index 88fa98cf03..7fc4c7e437 100644 --- a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu index 24752a1df0..3039d1fc1e 100644 --- a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu +++ b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/output_tile_threadmap.cu b/test/unit/epilogue/threadblock/output_tile_threadmap.cu index 6e6e96e71f..19824e8ae9 100644 --- a/test/unit/epilogue/threadblock/output_tile_threadmap.cu +++ b/test/unit/epilogue/threadblock/output_tile_threadmap.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu index 40874f7bf1..fddb0e17dc 100644 --- a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu +++ b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/testbed.h b/test/unit/epilogue/threadblock/testbed.h index 1dc9baa317..ba5241af3b 100644 --- a/test/unit/epilogue/threadblock/testbed.h +++ b/test/unit/epilogue/threadblock/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/testbed_planar_complex.h b/test/unit/epilogue/threadblock/testbed_planar_complex.h index 6afa603293..3c2959dbcc 100644 --- a/test/unit/epilogue/threadblock/testbed_planar_complex.h +++ b/test/unit/epilogue/threadblock/testbed_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/CMakeLists.txt b/test/unit/epilogue/warp/CMakeLists.txt index dbd7ee65b5..97b942d0f6 100644 --- a/test/unit/epilogue/warp/CMakeLists.txt +++ b/test/unit/epilogue/warp/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu index 9e94616f72..945732762e 100644 --- a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu index 3522c9e925..cf3ffe50cb 100644 --- a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu index 4931d93718..c6112a7e56 100644 --- a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/CMakeLists.txt b/test/unit/gemm/CMakeLists.txt index 4ac245716f..ff4280bf30 100644 --- a/test/unit/gemm/CMakeLists.txt +++ b/test/unit/gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 7ead7eba54..87e495987f 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -123,6 +123,8 @@ cutlass_test_unit_add_executable( BATCH_SOURCES ON BATCH_SIZE 4 + gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu + gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -172,7 +174,6 @@ cutlass_test_unit_add_executable( BATCH_SOURCES ON BATCH_SIZE 4 - gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu index fc887bce36..dbf2960dcd 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu index d8b9072736..5006bcb253 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu index 03f0b75251..0267531b8e 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu index 77777a66f3..b30226e83a 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu index f6862b0d2d..9de52a182c 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu index b4fb7eba02..079cf81224 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu index 3da9cdbb58..8c62c37888 100644 --- a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu index b0dbbdc856..2ee4e3f807 100644 --- a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu index b15af10764..11f3cc0543 100644 --- a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu index cec5ce60a5..0d9425347a 100644 --- a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu index c7df15d140..b71d02064d 100644 --- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu +++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu index 5113d2f800..76a7fe5002 100644 --- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu index 427c1e0e13..5922a4788e 100644 --- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu +++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu index 74fbc1f549..7796d5a80e 100644 --- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu index ea3da85d52..aca46c5f03 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu index 167949d8c6..c5ad79bab1 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu index ae72cade2f..4be57a632c 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu index 858fd301fe..abbd4f41dc 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu index 1f4d3e2933..efa04dba6e 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu index 2dc224ab2e..147a333b9c 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu index 71f21444cf..bc892a5ed9 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu index bb1665062e..28b75ec28c 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu index 3e8b96584f..ecd5a0463e 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu index cd6e48a3a2..19d2850f0e 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu index a9f9ea9978..a42c1827f0 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu index d797ed5577..326f11fd3b 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu index 7cf1fad244..2d86068aae 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu index cef53a2dc9..fa82abacb2 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu index be764f5282..410abad050 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu index 25d3e5bee8..616cb998a6 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu index f7c8fb23f2..4208cd509b 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu index 2798007695..4a75ec1601 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu index b4114ffe51..9314d2131b 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu index 6ca8ada8a5..1b034b85ac 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu index 64b697af81..3baadeec11 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu index cff5070599..976b671cb0 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu index 849b7582e6..323a980891 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu index 8a760b02ab..90feba8b33 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu index 9f2c2c542d..b4e09b5aac 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu index aa92606167..2d77ed3ea9 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu index dac3675b84..ffd8e3f73b 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu index 74434cc9fa..db25e6c7d2 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu index 176112d10f..4cdbdfe586 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu index 47e927d450..7b17e7f02e 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu index 8ae6464f27..da53084eaa 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu index de19ca0047..0472750584 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu index 0b83c6cbb7..e63e601fcd 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu index a81684241b..ab634268df 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu index 585b1df179..e2330ee64b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu index ab030e5a97..dcc5581d4e 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu index b8fa4dad8e..82f5869d94 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu index 358aacecd9..47bbf0cd8b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu index 957bcd2ab0..face7b9382 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu index 7c0f3b406a..015731c884 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu index 972756bba8..9a2a355a6b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu index ffba9c0dac..dfa302b613 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu index 14030b1d41..1429ed1a23 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu index 9a1918db44..5b104ee1fa 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu index 51a09194e4..e3a1074ff9 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu index 74d64af70d..0a232e0724 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu index d4bc720bca..d6fc20674b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu index dd0976d9f7..840e99de14 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu index 83c5cd1479..6fc1096ee1 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu index b62d99f78a..f028401cde 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu index 6d78dc9a9b..63368ed81b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu index 5ea2f9ce00..94c34bccce 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu index 0f773de4f2..caec177f96 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu index 54d6229a0d..fc0dbdf328 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu index d123931e1a..92a8d20366 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu index b1286accd1..7be0e08f05 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu index 5a511540fa..eb91e486c0 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu index 26f41ac2b7..47828166b8 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu index 06498afb9a..a7c64010d4 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu index e377980bbf..d005347b0f 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu index 96f5dcc947..26f1c79ccc 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu index 0497e61945..3cee859958 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu index 0f94d589c6..8c2951dad7 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu index 2163711b84..d6a481a8b3 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu index 91095a945d..e6fcabef5c 100644 --- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu index 2108eeb4e4..668a193e71 100644 --- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu index 869b59b51d..0cebe7a129 100644 --- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu index fda4371705..0efa0a9215 100644 --- a/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu index 7c2b6c6e38..ff2a5c752d 100644 --- a/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu index eec3ca4cdb..96a6320053 100644 --- a/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu index 64fe313c50..120cae0571 100644 --- a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu index 63c765c551..8f7425737e 100644 --- a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu index 99303712e5..f974cf16e1 100644 --- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu index 993b0b9d5a..f21995c597 100644 --- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu index 25fd50cfc3..f66ba86d74 100644 --- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu index 4cc4068170..6acd01359c 100644 --- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu index d53e3c0768..d0bfd412ae 100644 --- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -76,8 +76,7 @@ TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 64x128x128_32x64x128) {
     32,
     32,
     false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
   >;

   test::gemm::device::MultistageInterleavedTestbed testbed;
@@ -117,8 +116,7 @@ TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x128x128_64x64x128) {
     32,
     32,
     false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
   >;

   test::gemm::device::MultistageInterleavedTestbed testbed;
@@ -158,8 +156,7 @@ TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 256x128x128_64x64x128) {
     32,
     32,
     false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
   >;

   test::gemm::device::MultistageInterleavedTestbed testbed;
@@ -199,8 +196,7 @@ TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x256x128_64x64x128) {
     32,
     32,
     false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
   >;

   test::gemm::device::MultistageInterleavedTestbed testbed;
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
index 983dff337f..6f1eef5c96 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
index 8dd541838f..9bf9e45bc9 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
-  Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+  Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.

  Redistribution and use in source and binary forms, with or without modification, are permitted
  provided that the following conditions are met:
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
index 01a65b32a5..05827535d6 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu index 33f3b07a2a..76cfe97087 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu index 1a3f7dba85..3ec1553a4d 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu index aaf618267e..5597b61e37 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu index 857df472a7..4cbc34839e 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu index 51d182cd66..68f7c152fc 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu index 90fe6bcfd8..097ef5418a 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu index 393e68bfd6..0ca058ed38 100644 --- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu index c4900e489e..e5fae709e6 100644 --- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,8 +73,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x64x64_32x32x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -112,8 +111,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x64x64_64x32x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -151,8 +149,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x128x64_32x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -190,8 +187,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x128x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -229,8 +225,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x128x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -268,8 +263,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x256x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -307,8 +301,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x64x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -346,8 +339,7 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x256x64_64x64x64) { 16, 16, false, - cutlass::arch::OpMultiplyAddSaturate, - true + cutlass::arch::OpMultiplyAddSaturate >; test::gemm::device::MultistageInterleavedTestbed testbed; @@ -358,4 +350,3 @@ TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x256x64_64x64x64) { //////////////////////////////////////////////////////////////////////////////// #endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) - diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu index 6ac9b71bf2..215e6a6a5e 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu index 9e1076a833..21ab4f746f 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu index cc6e4c3a5d..b139ec4117 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu index 86a678d22b..d0c3bcce80 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu index a86dc2442e..9399725d35 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu index 5b9b1d7d95..26db722085 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu index d53571a2d7..6fda75299b 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu index 024cba0a49..1358cec554 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu index 2d6db336f6..4d7e5b3a42 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu index ac5757e0ee..36c76e61aa 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu index 93642e64b6..2ee6365985 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu index 197e69b710..b5b99164f8 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu @@ -1,5 +1,5 @@ /************************************************************************************************** - Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu index 719e2ac760..9447d611e0 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu index e7a01bed61..e2057f5a13 100644 --- a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu index 39b5f10a70..dfbc64e2a8 100644 --- a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu +++ b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu index 42e991ed09..3f8cc5eeef 100644 --- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu +++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu index 3381f1703a..f4f4fc55db 100644 --- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu index 78c6e8657e..9bc2f10c9a 100644 --- a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu index 11af88897f..6ba1ddf371 100644 --- a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu index a28101f3d5..7eed9680ba 100644 --- a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu index a1a0fd7e31..8374c51889 100644 --- a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu index a63163680b..4dc26800d2 100644 --- a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu index e32441941d..891b5b578a 100644 --- a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu index 301cce7851..901102cd76 100644 --- a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu +++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu index df28110a33..aedfdde45f 100644 --- a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu +++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu similarity index 96% rename from test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu rename to test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu index e7b4405a08..ef66c6a226 100644 --- a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu +++ b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,7 +45,7 @@ //////////////////////////////////////////////////////////////////////////////// -#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) //////////////////////////////////////////////////////////////////////////////// @@ -105,7 +105,7 @@ TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32_u //////////////////////////////////////////////////////////////////////////////// -#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) +#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) //////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/multistage_testbed.h b/test/unit/gemm/device/multistage_testbed.h index f7b6ac8f56..6fb573b918 100644 --- a/test/unit/gemm/device/multistage_testbed.h +++ b/test/unit/gemm/device/multistage_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/multistage_testbed_interleaved.h b/test/unit/gemm/device/multistage_testbed_interleaved.h index c98264de01..1b12cd5b29 100644 --- a/test/unit/gemm/device/multistage_testbed_interleaved.h +++ b/test/unit/gemm/device/multistage_testbed_interleaved.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu index 5aabfca587..680012bcac 100644 --- a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu index c5265ce2b9..0f20a92f73 100644 --- a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu index 9db96c996a..a6072d2804 100644 --- a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu index 0ac7b4c9f8..8162905b31 100644 --- a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu index 1efa9d0446..af5dbb7cd5 100644 --- a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu index 886c0f9c74..d5cb5e7546 100644 --- a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu index a43d0afd5d..84cb465b20 100644 --- a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu index 0175978d00..e9633f5c4b 100644 --- a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu index a3aa5ce840..9cabed9069 100644 --- a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu index d5541939e9..83f5ceb12e 100644 --- a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu index 526bc01a4c..d7c67e2dee 100644 --- a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu index ad464b3018..cfd60a3b14 100644 --- a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_igemm_nn_sm50.cu b/test/unit/gemm/device/simt_igemm_nn_sm50.cu index 3db133ebfd..be25b52027 100644 --- a/test/unit/gemm/device/simt_igemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_igemm_nt_sm50.cu b/test/unit/gemm/device/simt_igemm_nt_sm50.cu index 01f56ea030..8a81a7b48c 100644 --- a/test/unit/gemm/device/simt_igemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_igemm_tn_sm50.cu b/test/unit/gemm/device/simt_igemm_tn_sm50.cu index 3692ec2c3b..2a871ecc5d 100644 --- a/test/unit/gemm/device/simt_igemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_igemm_tt_sm50.cu b/test/unit/gemm/device/simt_igemm_tt_sm50.cu index 2254669b36..f86e8e975e 100644 --- a/test/unit/gemm/device/simt_igemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61.cu b/test/unit/gemm/device/simt_int8_igemm_sm61.cu index 1364a38cff..ca5f3e7b6b 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,9 +72,7 @@ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, \ 2 \ >; \ - EXPECT_TRUE(test::gemm::device::TestAllGemm()); \ - - + EXPECT_TRUE(test::gemm::device::TestAllGemm()); //////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu index 4e4308ff37..cad5de367d 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu index 88c72aee4c..e7badc070a 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu index 0412d751c3..64e524b419 100644 --- a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu index 1adb9b5ae4..e520e29810 100644 --- a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu index f0fe1ebd94..3a1b5de6ea 100644 --- a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu +++ b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu index 0c00e56084..aa3a0d6eed 100644 --- a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu index c183fbff34..9ed5f1292c 100644 --- a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu +++ b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu index ce7ab9a7e0..c148c9564b 100644 --- a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_sm50.py b/test/unit/gemm/device/simt_sm50.py index f53dae2715..525fa2a8c1 100644 --- a/test/unit/gemm/device/simt_sm50.py +++ b/test/unit/gemm/device/simt_sm50.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -123,7 +123,7 @@ # write file header out.write("/***************************************************************************************************\n" -" * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.\n" +" * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.\n" " *\n" " * Redistribution and use in source and binary forms, with or without modification, are permitted\n" " * provided that the following conditions are met:\n" diff --git a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu index 7731559a81..e325ced874 100644 --- a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu index 17ea98203a..2a309a4eca 100644 --- a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu index 175c312868..2a9f33d4bb 100644 --- a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu index 544e626c5a..013a1ba53e 100644 --- a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h index c2bf40ec21..24ec13e495 100644 --- a/test/unit/gemm/device/testbed.h +++ b/test/unit/gemm/device/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -284,10 +284,13 @@ struct Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive test if insufficient CUDA device - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } this->initialize(problem_size); diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h index a3e1353ee1..941fa93fba 100644 --- a/test/unit/gemm/device/testbed_complex.h +++ b/test/unit/gemm/device/testbed_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -101,6 +101,7 @@ struct TestbedComplex : public Testbed { return this->compare_reference(problem_size, alpha, beta); } + /// Returns true if the CUDA device is sufficient to execute the kernel. bool sufficient() const { // // Determine SMEM requirements and waive if not satisfied @@ -136,10 +137,13 @@ struct TestbedComplex : public Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive the test if device not sufficient - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." 
<< std::endl; + } + return true; + } // // Initialize workspace diff --git a/test/unit/gemm/device/testbed_interleaved.h b/test/unit/gemm/device/testbed_interleaved.h index 6e14f87f6e..3ea1d222b9 100644 --- a/test/unit/gemm/device/testbed_interleaved.h +++ b/test/unit/gemm/device/testbed_interleaved.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -137,9 +137,13 @@ struct InterleavedTestbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } // // Allocate the GEMM workspace diff --git a/test/unit/gemm/device/testbed_planar_complex.h b/test/unit/gemm/device/testbed_planar_complex.h index 0e4e561e42..3bc997757a 100644 --- a/test/unit/gemm/device/testbed_planar_complex.h +++ b/test/unit/gemm/device/testbed_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -112,10 +112,47 @@ class TestbedPlanarComplex { tensor_D.sync_device(); } + /// Returns true if the CUDA device is sufficient to execute the kernel. + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + bool run( cutlass::complex alpha = {1, 0}, cutlass::complex beta = {0, 0}) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } + initialize(); int batch_count = 1; diff --git a/test/unit/gemm/device/testbed_sanity.h b/test/unit/gemm/device/testbed_sanity.h index 025fb3874d..90f61590d5 100644 --- a/test/unit/gemm/device/testbed_sanity.h +++ b/test/unit/gemm/device/testbed_sanity.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_sparse.h b/test/unit/gemm/device/testbed_sparse.h index 28901a9867..e2611210d1 100644 --- a/test/unit/gemm/device/testbed_sparse.h +++ b/test/unit/gemm/device/testbed_sparse.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -295,6 +295,7 @@ struct SparseTestbed { return compare_reference(problem_size, alpha, beta); } + /// Returns true if the CUDA device is sufficient to execute the kernel. bool sufficient() const { // // Determine SMEM requirements and waive if not satisfied @@ -330,10 +331,13 @@ struct SparseTestbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive test if insufficient CUDA device - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } this->initialize(problem_size); diff --git a/test/unit/gemm/device/testbed_splitk.h b/test/unit/gemm/device/testbed_splitk.h index 792d73923a..5e5d7b329f 100644 --- a/test/unit/gemm/device/testbed_splitk.h +++ b/test/unit/gemm/device/testbed_splitk.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -61,6 +61,35 @@ struct TestbedSplitK : public Testbed { ): Base(init_A_, init_B_, init_C_, seed_) { } + /// Returns true if the CUDA device is sufficient to execute the kernel. + bool sufficient() const { + // + // Determine SMEM requirements and waive if not satisfied + // + + int smem_size = int(sizeof(typename Gemm::GemmKernel::SharedStorage)); + + cudaDeviceProp properties; + int device_idx; + cudaError_t result = cudaGetDevice(&device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDevice() API call failed."); + } + + result = cudaGetDeviceProperties(&properties, device_idx); + + if (result != cudaSuccess) { + throw std::runtime_error("cudaGetDeviceProperties() failed"); + } + + if (properties.sharedMemPerMultiprocessor < smem_size) { + return false; + } + + return true; + } + /// Executes one test bool run( cutlass::gemm::GemmCoord problem_size, @@ -68,6 +97,14 @@ struct TestbedSplitK : public Testbed { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." 
<< std::endl; + } + return true; + } + this->initialize(problem_size); // diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h index fb36f10e25..4252fd953b 100644 --- a/test/unit/gemm/device/testbed_universal.h +++ b/test/unit/gemm/device/testbed_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -250,6 +250,7 @@ struct TestbedUniversal { return compare_reference(problem_size, alpha, beta); } + /// Returns true if the CUDA device is sufficient to execute the kernel. bool sufficient() const { // // Determine SMEM requirements and waive if not satisfied @@ -286,10 +287,13 @@ struct TestbedUniversal { ElementCompute alpha = ElementCompute(1), ElementCompute beta = ElementCompute(0)) { - // Waive test if insufficient CUDA device - if (!sufficient()) { - return true; - } + // Waive test if insufficient CUDA device + if (!sufficient()) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; + } this->initialize(problem_size); diff --git a/test/unit/gemm/device/testbed_utils.h b/test/unit/gemm/device/testbed_utils.h index 9325b40fe3..2a77e6c8d0 100644 --- a/test/unit/gemm/device/testbed_utils.h +++ b/test/unit/gemm/device/testbed_utils.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/kernel/batched_gemv.cu b/test/unit/gemm/kernel/batched_gemv.cu new file mode 100755 index 0000000000..bf479641d1 --- /dev/null +++ b/test/unit/gemm/kernel/batched_gemv.cu @@ -0,0 +1,1076 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "testbed_gemv.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcr_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcr_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcr_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcr_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_rcr_alpha_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_rcr_alpha_beta_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord 
problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size, 4.5f, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x24x4096_1x8x4x64_1x1x4x64_rcr_alpha_beta_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 24, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size, cutlass::half_t(4.5f), cutlass::half_t(-0.5f)); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcr_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcr_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcr_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcr_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcr_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + 
test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcr_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcr_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcr_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcr_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcr_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcr_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 
1x7x256x4096_1x8x4x64_1x1x4x64_rcr_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + kBatchTileSize>(problem_size); +} + +///////////// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_crc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_crc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_crc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_crc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_crc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_crc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + 
test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_crc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_crc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_crc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_crc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_crc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_crc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + 
kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_crc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_crc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_crc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_crc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_crc_alpha_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_crc_alpha_beta_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, 4.5f, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x24x4096_1x8x4x64_1x1x4x64_crc_alpha_beta_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 24, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = 
cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, cutlass::half_t(4.5f), cutlass::half_t(-0.5f)); +} + +///////////// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcc_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + 
cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcc_fp16_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcc_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +/// + +TEST(SM50_batched_gemv, 1x64x64x1_1x64x4x1_1x4x4x1_rcc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 
64, 1); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 1; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x64x4_1x64x4x2_1x4x4x2_rcc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 64, 4); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 2; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x256x256x64_1x64x4x8_1x4x4x8_rcc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 256, 256, 64); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 64, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 4, 4>; + static int const kBatchTileSize = 8; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x7x256x4096_1x8x4x64_1x1x4x64_rcc_i8_i32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 7, 256, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + int8_t, int32_t, int32_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_rcc_alpha_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x27x4096_1x8x1x64_1x1x1x64_rcc_alpha_beta_fp32_fp32) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 27, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 1>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 1>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + float, float, float, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, 4.5f, -0.5f); +} + +TEST(SM50_batched_gemv, 1x64x24x4096_1x8x4x64_1x1x4x64_rcc_alpha_beta_fp16_fp16) +{ + cutlass::gemm::BatchedGemmCoord problem_size(1, 64, 24, 4096); + + using ThreadBlockShape = cutlass::gemm::GemmShape<1, 8, 4>; + using ThreadShape = cutlass::gemm::GemmShape<1, 1, 4>; + static int const kBatchTileSize = 64; + + test::gemm::kernel::batched_gemv_kernel_test< + ThreadBlockShape, + ThreadShape, + 
cutlass::half_t, float, cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::layout::ColumnMajor, + cutlass::layout::ColumnMajor, + kBatchTileSize>(problem_size, cutlass::half_t(4.5f), cutlass::half_t(-0.5f)); +} diff --git a/test/unit/gemm/kernel/testbed_gemv.h b/test/unit/gemm/kernel/testbed_gemv.h new file mode 100755 index 0000000000..fb9c7d7076 --- /dev/null +++ b/test/unit/gemm/kernel/testbed_gemv.h @@ -0,0 +1,352 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/core_io.h" +#include "cutlass/numeric_types.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "cutlass/gemm/kernel/default_gemv.h" +#include "cutlass/gemm/kernel/gemv_batched_strided.h" + +namespace test { +namespace gemm { +namespace kernel { + +template +void batched_gemv_kernel_test(cutlass::gemm::BatchedGemmCoord problem_size, + ElementCD_ alpha = ElementCD_(1), + ElementCD_ beta = ElementCD_(0), + bool perf_test = false, + int perf_test_iter = 1) +{ + using ThreadBlockShape = ThreadBlockShape_; + using ThreadShape = ThreadShape_; + using ElementA = ElementAB_; + using LayoutA = LayoutA_; + using ElementB = ElementAB_; + using LayoutB = LayoutB_; + using ElementAccumulator = ElementCD_; + using ElementCD = ElementCD_; + using LayoutCD = LayoutCD_; + + using GemvKernel = cutlass::gemm::kernel::DefaultGemv; + + using ThreadBlockGemv = typename GemvKernel::ThreadBlockGemv; + using ThreadBlockSwizzle = typename GemvKernel::ThreadBlockSwizzle; + + if (DEBUG) + { + problem_size = cutlass::gemm::BatchedGemmCoord( + problem_size.m(), problem_size.n(), problem_size.k(), 1); + } + + // Create host tensors that will be the backing store for the batches + // Note that no device memory is initially allocated + cutlass::HostTensor matrix_A({problem_size.m(), problem_size.k()}, false); + cutlass::HostTensor matrix_B({problem_size.k(), problem_size.n()}, false); + cutlass::HostTensor matrix_C_computed({problem_size.m(), problem_size.n()}, false); + cutlass::HostTensor matrix_C_reference({problem_size.m(), problem_size.n()}, false); + + // Reserve memory for the batch of tensors + matrix_A.reserve(problem_size.m()*problem_size.k()*problem_size.batch()); + matrix_B.reserve(problem_size.n()*problem_size.k()*problem_size.batch()); + matrix_C_computed.reserve(problem_size.m()*problem_size.n()*problem_size.batch()); + matrix_C_reference.reserve(problem_size.m()*problem_size.n()*problem_size.batch(), false); + + // Fill each tensor batch + const int seed = 9876; + for (int b = 0; b < problem_size.batch(); b++) + { + if(DEBUG) + { + cutlass::reference::host::BlockFillSequential( + matrix_A.host_data_ptr_offset(b*matrix_A.capacity()), matrix_A.capacity()); + cutlass::reference::host::BlockFillSequential( + matrix_B.host_data_ptr_offset(b*matrix_B.capacity()), matrix_B.capacity()); + } + else + { + cutlass::reference::host::TensorFillRandomUniform( + matrix_A.host_view(b*matrix_A.capacity()), + seed + 1660, + 8, + -8, + 0 + ); + + cutlass::reference::host::TensorFillRandomUniform( + matrix_B.host_view(b*matrix_B.capacity()), + seed + 1880, + 8, + -8, + 0 + ); + } + + cutlass::reference::host::TensorFill(matrix_C_computed.host_view(b*matrix_C_computed.capacity())); + cutlass::reference::host::TensorFill(matrix_C_reference.host_view(b*matrix_C_reference.capacity())); + } + + matrix_A.sync_device(); + matrix_B.sync_device(); + matrix_C_computed.sync_device(); + + ThreadBlockSwizzle swizzle; + + cutlass::gemm::BatchedGemmCoord tiled_size{ThreadBlockShape::kM, + ThreadBlockShape::kN, +
problem_size.k(), // no split-k + DEBUG ? 1 : LDG_B }; + + cutlass::gemm::BatchedGemmCoord tiled_shape = swizzle.get_tiled_shape(problem_size, tiled_size); + + #if 0 + printf("tiled_size = %d %d %d %d\n", tiled_size.m(), tiled_size.n(), tiled_size.k(), tiled_size.batch()); + printf("tiled_shape = %d %d %d %d\n", tiled_shape.m(), tiled_shape.n(), tiled_shape.k(), tiled_shape.batch()); + #endif + + // No split-k + EXPECT_EQ(tiled_size.k(), problem_size.k()); + + dim3 grid = swizzle.get_grid_shape(tiled_shape); + dim3 block(tiled_size.n() / ThreadShape::kN, tiled_size.batch(), tiled_size.k() / problem_size.k()); + + // Some sanity checks + EXPECT_TRUE( block.x*block.y*block.z <= 1024 ); + EXPECT_TRUE( block.x <= 1024 ); + EXPECT_TRUE( block.y <= 1024 ); + EXPECT_TRUE( block.z <= 64 ); + + #if 0 + printf("grid dim = %d, %d, %d\n", grid.x, grid.y, grid.z); + printf("block dim = %d, %d, %d\n", block.x, block.y, block.z); + #endif + + cudaError_t result; + cudaEvent_t start_event, end_event; + + for (int iter = 0; iter < (perf_test ? (perf_test_iter+1) : 1); ++iter) + { + if (perf_test && iter == 1) + { + result = cudaEventCreate(&start_event); + EXPECT_EQ(result, cudaSuccess); + + result = cudaEventCreate(&end_event); + EXPECT_EQ(result, cudaSuccess); + + result = cudaEventRecord(start_event); + EXPECT_EQ(result, cudaSuccess); + } + + if (beta == ElementCD(0)) + { + if (alpha == ElementCD(1)) + { + cutlass::gemm::kernel::GemvBatchedStrided<<< grid, block >>>( + problem_size, + matrix_A.device_ref(), + matrix_A.capacity(), + matrix_B.device_ref(), + matrix_B.capacity(), + matrix_C_computed.device_ref(), + matrix_C_computed.capacity() + ); + } + else + { + cutlass::gemm::kernel::GemvBatchedStrided<<< grid, block >>>( + problem_size, + alpha, + matrix_A.device_ref(), + matrix_A.capacity(), + matrix_B.device_ref(), + matrix_B.capacity(), + matrix_C_computed.device_ref(), + matrix_C_computed.capacity() + ); + } + } + else + { + cutlass::gemm::kernel::GemvBatchedStrided<<< grid, block >>>( + problem_size, + alpha, + beta, + matrix_A.device_ref(), + matrix_A.capacity(), + matrix_B.device_ref(), + matrix_B.capacity(), + matrix_C_computed.device_ref(), + matrix_C_computed.capacity(), + matrix_C_computed.device_ref(), + matrix_C_computed.capacity() + ); + } + + if (iter == 0) + { + result = cudaGetLastError(); + EXPECT_EQ(result, cudaSuccess) << " kernel error: " << cudaGetErrorString(result); + } + } + + if (perf_test) + { + result = cudaEventRecord(end_event); + EXPECT_EQ(result, cudaSuccess); + } + + result = cudaDeviceSynchronize(); + EXPECT_EQ(result, cudaSuccess) << " kernel error: " << cudaGetErrorString(result); + + if (perf_test) + { + float ms; + result = cudaEventElapsedTime(&ms, start_event, end_event); + EXPECT_EQ(result, cudaSuccess); + + double flops = (double(problem_size.m()) * + double(problem_size.n()) * + double(problem_size.k()) * + double(problem_size.batch()) * 2); // 2 for MAC + + double read_bytes = double(problem_size.batch()) * (sizeof(ElementA)*double(problem_size.m())*double(problem_size.k()) + + sizeof(ElementB)*double(problem_size.k())*double(problem_size.n())); + + double write_bytes = double(problem_size.batch()) * (sizeof(ElementCD)*double(problem_size.m())*double(problem_size.n())); + + double avg_runtime = double(ms) / perf_test_iter; + double gflops_per_sec = flops / 1.0e6 / avg_runtime; + double read_bandwidth = read_bytes / 1.0e6 / avg_runtime; + double write_bandwidth = write_bytes / 1.0e6 / avg_runtime; + + std::cout << "\n\nProblem size: " + << problem_size.m() + 
<< " x " << problem_size.n() + << " x " << problem_size.k() + << " x " << problem_size.batch() + << std::endl; + + std::cout << " GFLOPs: " << gflops_per_sec << std::endl; + std::cout << "BW (R/W): " << read_bandwidth << " / " << write_bandwidth << " GB/sec" << std::endl; + std::cout << " Runtime: " << avg_runtime << " ms" << std::endl; + } + else + { + matrix_C_computed.sync_host(); + + // Compute the batched gemms + for (int b = 0; b < problem_size.batch(); b++) + { + cutlass::reference::host::Gemm + reference_gemm; + + reference_gemm( + problem_size.mnk(), alpha, + matrix_A.host_ref(b * matrix_A.capacity()), + matrix_B.host_ref(b * matrix_B.capacity()), beta, + matrix_C_reference.host_ref(b * matrix_C_computed.capacity())); + + bool passed = cutlass::reference::host::TensorEquals( + matrix_C_computed.host_view(b * matrix_C_computed.capacity()), + matrix_C_reference.host_view(b * matrix_C_reference.capacity())); + + EXPECT_TRUE(passed) + //<< "A:\n" << matrix_A.host_view() << "\n" + //<< "B:\n" << matrix_B.host_view() << "\n" + << "Batch: " << b << "\n" + << "Reference:\n" + << matrix_C_reference.host_view(b * matrix_C_reference.capacity()) + << "\n" + << "Computed:\n" + << matrix_C_computed.host_view(b * matrix_C_computed.capacity()) + << "\n"; + } + } +} + +template +void batched_gemv_kernel_perf_test(cutlass::gemm::BatchedGemmCoord problem_size, + ElementCD_ alpha = ElementCD_(1), + ElementCD_ beta = ElementCD_(0), + int iter = 50) +{ + batched_gemv_kernel_test(problem_size, alpha, beta, true, iter); +} + +} // namespace threadblock +} // namespace kernel +} // namespace test diff --git a/test/unit/gemm/thread/CMakeLists.txt b/test/unit/gemm/thread/CMakeLists.txt index 48ca115728..af84c9d0a7 100644 --- a/test/unit/gemm/thread/CMakeLists.txt +++ b/test/unit/gemm/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm50.cu b/test/unit/gemm/thread/gemm_sm50.cu index 4265922841..c28fc20c23 100644 --- a/test/unit/gemm/thread/gemm_sm50.cu +++ b/test/unit/gemm/thread/gemm_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm60.cu b/test/unit/gemm/thread/gemm_sm60.cu index b0b9fdb5b7..3725ccbbd6 100644 --- a/test/unit/gemm/thread/gemm_sm60.cu +++ b/test/unit/gemm/thread/gemm_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm61.cu b/test/unit/gemm/thread/gemm_sm61.cu index f6e7724dd8..50a8ba7839 100644 --- a/test/unit/gemm/thread/gemm_sm61.cu +++ b/test/unit/gemm/thread/gemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/CMakeLists.txt b/test/unit/gemm/thread/host/CMakeLists.txt index c58540264d..136d0f33c4 100644 --- a/test/unit/gemm/thread/host/CMakeLists.txt +++ b/test/unit/gemm/thread/host/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/gemm_sm60_host.cu b/test/unit/gemm/thread/host/gemm_sm60_host.cu index 346b80cbe2..aef63790ff 100644 --- a/test/unit/gemm/thread/host/gemm_sm60_host.cu +++ b/test/unit/gemm/thread/host/gemm_sm60_host.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/testbed_host.h b/test/unit/gemm/thread/host/testbed_host.h index 4d5e441dd5..ef24bbc30f 100644 --- a/test/unit/gemm/thread/host/testbed_host.h +++ b/test/unit/gemm/thread/host/testbed_host.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/testbed.h b/test/unit/gemm/thread/testbed.h index bdfb8278f4..175cd4cdd3 100644 --- a/test/unit/gemm/thread/testbed.h +++ b/test/unit/gemm/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/CMakeLists.txt b/test/unit/gemm/threadblock/CMakeLists.txt index f4f074fe99..7ad3ca784b 100644 --- a/test/unit/gemm/threadblock/CMakeLists.txt +++ b/test/unit/gemm/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. 
All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/batched_gemv.cu b/test/unit/gemm/threadblock/batched_gemv.cu index 94ae947bd2..f4a9d425a3 100644 --- a/test/unit/gemm/threadblock/batched_gemv.cu +++ b/test/unit/gemm/threadblock/batched_gemv.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/epilogue_workspace.cu b/test/unit/gemm/threadblock/epilogue_workspace.cu index 1301aeb4dd..b627a5a96a 100644 --- a/test/unit/gemm/threadblock/epilogue_workspace.cu +++ b/test/unit/gemm/threadblock/epilogue_workspace.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_multistage.cu b/test/unit/gemm/threadblock/mma_multistage.cu index e4a030d6fa..8e76904189 100644 --- a/test/unit/gemm/threadblock/mma_multistage.cu +++ b/test/unit/gemm/threadblock/mma_multistage.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse.cu b/test/unit/gemm/threadblock/mma_multistage_sparse.cu index 13eb180e05..ca5b259c6d 100644 --- a/test/unit/gemm/threadblock/mma_multistage_sparse.cu +++ b/test/unit/gemm/threadblock/mma_multistage_sparse.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h index d667d8f550..a947af7f5c 100644 --- a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h +++ b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed.h b/test/unit/gemm/threadblock/mma_multistage_testbed.h index 6b8dc94fb6..84dfdbdb5c 100644 --- a/test/unit/gemm/threadblock/mma_multistage_testbed.h +++ b/test/unit/gemm/threadblock/mma_multistage_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -267,6 +267,9 @@ struct Testbed { cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); if (result != cudaSuccess) { + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } return true; } @@ -275,7 +278,10 @@ struct Testbed { cudaFuncAttributePreferredSharedMemoryCarveout, 100); if (result != cudaSuccess) { - return true; + if (CUTLASS_TEST_UNIT_ENABLE_WARNINGS) { + std::cerr << "Test waived due to insufficient CUDA device." << std::endl; + } + return true; } } diff --git a/test/unit/gemm/threadblock/mma_pipelined_simt.cu b/test/unit/gemm/threadblock/mma_pipelined_simt.cu index 522b029adb..010e4306c4 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_simt.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_simt.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu index c9c714bcf6..301b8ea878 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu index e4125eb4f0..134712b660 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm80.cu b/test/unit/gemm/threadblock/mma_pipelined_sm80.cu index 14dd68e72d..7cd16006ef 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm80.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed.h b/test/unit/gemm/threadblock/mma_pipelined_testbed.h index 8190c50a41..ee71c51a6e 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_testbed.h +++ b/test/unit/gemm/threadblock/mma_pipelined_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu index 4fb964c1ae..6214359b26 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu index fd2ae356fa..c67a24740b 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu b/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu index ebcf0a355e..4465a3aa8f 100644 --- a/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu +++ b/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h index 148e34d959..e1b537d556 100644 --- a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h +++ b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu index 8c687f8810..146849d923 100644 --- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu +++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu index 262269b75d..909e56c4f4 100644 --- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu +++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/CMakeLists.txt b/test/unit/gemm/warp/CMakeLists.txt index 695508fa5a..14a85df83e 100644 --- a/test/unit/gemm/warp/CMakeLists.txt +++ b/test/unit/gemm/warp/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -29,6 +29,7 @@ cutlass_test_unit_add_executable( gemm_sm75.cu gemm_sm80.cu gemm_complex_sm80.cu + gemm_sparse_sm80.cu gemm_gaussian_complex_sm80.cu wmma_sm70.cu wmma_sm72.cu diff --git a/test/unit/gemm/warp/gemm_complex_sm80.cu b/test/unit/gemm/warp/gemm_complex_sm80.cu index 99effe4004..abc26487aa 100644 --- a/test/unit/gemm/warp/gemm_complex_sm80.cu +++ b/test/unit/gemm/warp/gemm_complex_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu index 43ad2dfd85..682d37b559 100644 --- a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu +++ b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm50.cu b/test/unit/gemm/warp/gemm_sm50.cu index bb4ba5be58..88b84d8743 100644 --- a/test/unit/gemm/warp/gemm_sm50.cu +++ b/test/unit/gemm/warp/gemm_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -35,7 +35,7 @@ #include "testbed.h" ///////////////////////////////////////////////////////////////////////////////////////////////// - +// NT SMEM layout TEST(SM50_warp_gemm_f32_col_row_col, 32x16x1_4x4x1) { using Policy = cutlass::gemm::warp::MmaSimtPolicy< @@ -58,6 +58,78 @@ TEST(SM50_warp_gemm_f32_col_row_col, 32x16x1_4x4x1) { test::gemm::warp::Testbed >().run(); } +// TN SMEM layout +TEST(SM50_warp_gemm_f32_row_col_col, 32x16x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + + test::gemm::warp::Testbed >().run(); +} + +// TT SMEM layout +TEST(SM50_warp_gemm_f32_row_row_col, 32x16x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + + test::gemm::warp::Testbed >().run(); +} + +// NN SMEM layout +TEST(SM50_warp_gemm_f32_col_col_col, 32x16x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + + test::gemm::warp::Testbed >().run(); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// NT SMEM layout TEST(SM50_warp_gemm_f32_col_row_row, 16x32x1_4x4x1) { using Policy = cutlass::gemm::warp::MmaSimtPolicy< @@ 
-80,8 +152,31 @@ TEST(SM50_warp_gemm_f32_col_row_row, 16x32x1_4x4x1) { test::gemm::warp::Testbed >().run(); } -///////////////////////////////////////////////////////////////////////////////////////////////// +// TN SMEM layout +TEST(SM50_warp_gemm_f32_row_col_row, 16x32x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<4, 8>, + cutlass::layout::RowMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<16, 32, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed >().run(); +} +///////////////////////////////////////////////////////////////////////////////////////////////// +// NT SMEM layout TEST(SM50_warp_gemm_f32_col_row_col, 32x16x1_2x2x1) { using Policy = cutlass::gemm::warp::MmaSimtPolicy< @@ -126,8 +221,52 @@ TEST(SM50_warp_gemm_f32_col_row_row, 32x16x1_2x2x1) { test::gemm::warp::Testbed>().run(); } -///////////////////////////////////////////////////////////////////////////////////////////////// +// TN SMEM layout +TEST(SM50_warp_gemm_f32_row_col_col, 32x16x1_2x2x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<2, 2, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + test::gemm::warp::Testbed>().run(); +} + +TEST(SM50_warp_gemm_f32_row_col_row, 32x16x1_2x2x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::RowMajorInterleaved<2>, + cutlass::gemm::GemmShape<2, 2, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 16, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed>().run(); +} +///////////////////////////////////////////////////////////////////////////////////////////////// +// NT SMEM layout TEST(SM50_warp_gemm_f32_col_row_col, 32x64x1_4x4x1) { using Policy = cutlass::gemm::warp::MmaSimtPolicy< @@ -172,6 +311,50 @@ TEST(SM50_warp_gemm_f32_col_row_row, 32x64x1_4x4x1) { test::gemm::warp::Testbed>().run(); } +// TN SMEM layout +TEST(SM50_warp_gemm_f32_row_col_col, 32x64x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<8, 4>, + cutlass::layout::ColumnMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<64, 32, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + Policy + >; + + test::gemm::warp::Testbed>().run(); +} + +TEST(SM50_warp_gemm_f32_row_col_row, 32x64x1_4x4x1) { + + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape<4, 8>, + cutlass::layout::RowMajorInterleaved<2>, + cutlass::gemm::GemmShape<4, 4, 1> + >; + + using Mma = cutlass::gemm::warp::MmaSimt< + cutlass::gemm::GemmShape<32, 64, 8>, + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + Policy + >; + + test::gemm::warp::Testbed>().run(); +} 
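The gemm_sm50.cu additions above extend the SM50 SIMT warp-level coverage from the original NT case (A column-major, B row-major in shared memory) to the TN, TT, and NN combinations. Every added test follows the same recipe, so one sketch is enough to show how the pieces fit; the shapes and layouts below are copied from one of the added TN tests rather than invented.

```cpp
#include "cutlass/matrix_shape.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/warp/mma_simt_policy.h"
#include "cutlass/gemm/warp/mma_simt.h"

// Warp-level SIMT policy: an 8x4 arrangement of lanes, each computing a
// 4x4x1 tile, with the interleaved lane layout used by the tests above.
using Policy = cutlass::gemm::warp::MmaSimtPolicy<
    cutlass::MatrixShape<8, 4>,
    cutlass::layout::ColumnMajorInterleaved<2>,
    cutlass::gemm::GemmShape<4, 4, 1>
>;

// "TN" shared-memory layout: A row-major, B column-major, C column-major.
using Mma = cutlass::gemm::warp::MmaSimt<
    cutlass::gemm::GemmShape<32, 16, 8>,   // warp-level M x N x K
    float, cutlass::layout::RowMajor,      // A
    float, cutlass::layout::ColumnMajor,   // B
    float, cutlass::layout::ColumnMajor,   // C
    Policy
>;

// In the unit tests this type is exercised as:
//   test::gemm::warp::Testbed<Mma>().run();
```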
///////////////////////////////////////////////////////////////////////////////////////////////// TEST(SM50_warp_gemm_complex_f32_col_row_col, 64x32x1_2x2x1) { @@ -409,5 +592,4 @@ TEST(SM50_warp_gemm_complex_f64_col_row_row, 32x16x1_1x1x1) { test::gemm::warp::Testbed>().run(); } - ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/warp/gemm_sm60.cu b/test/unit/gemm/warp/gemm_sm60.cu index 4f2f3f1582..2196d10415 100644 --- a/test/unit/gemm/warp/gemm_sm60.cu +++ b/test/unit/gemm/warp/gemm_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm61.cu b/test/unit/gemm/warp/gemm_sm61.cu index 63e07165b6..71a905b5eb 100644 --- a/test/unit/gemm/warp/gemm_sm61.cu +++ b/test/unit/gemm/warp/gemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm70.cu b/test/unit/gemm/warp/gemm_sm70.cu index 3785290e5c..00678f3040 100644 --- a/test/unit/gemm/warp/gemm_sm70.cu +++ b/test/unit/gemm/warp/gemm_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm75.cu b/test/unit/gemm/warp/gemm_sm75.cu index 144475cae4..202e543640 100644 --- a/test/unit/gemm/warp/gemm_sm75.cu +++ b/test/unit/gemm/warp/gemm_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm80.cu b/test/unit/gemm/warp/gemm_sm80.cu index 0f736b1355..32abb54167 100644 --- a/test/unit/gemm/warp/gemm_sm80.cu +++ b/test/unit/gemm/warp/gemm_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sparse_sm80.cu b/test/unit/gemm/warp/gemm_sparse_sm80.cu index 8df0846076..6ae76c11f9 100644 --- a/test/unit/gemm/warp/gemm_sparse_sm80.cu +++ b/test/unit/gemm/warp/gemm_sparse_sm80.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/testbed.h b/test/unit/gemm/warp/testbed.h index 3cc00fb447..cc5b55b26f 100644 --- a/test/unit/gemm/warp/testbed.h +++ b/test/unit/gemm/warp/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm70.cu b/test/unit/gemm/warp/wmma_sm70.cu index 5b9ce63db1..6d777acf73 100644 --- a/test/unit/gemm/warp/wmma_sm70.cu +++ b/test/unit/gemm/warp/wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm72.cu b/test/unit/gemm/warp/wmma_sm72.cu index 89bfbb5945..3a0c80f687 100644 --- a/test/unit/gemm/warp/wmma_sm72.cu +++ b/test/unit/gemm/warp/wmma_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm75.cu b/test/unit/gemm/warp/wmma_sm75.cu index 3818793e84..0751daeb5c 100644 --- a/test/unit/gemm/warp/wmma_sm75.cu +++ b/test/unit/gemm/warp/wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/CMakeLists.txt b/test/unit/layout/CMakeLists.txt index 29ebdbdd30..df496bc00e 100644 --- a/test/unit/layout/CMakeLists.txt +++ b/test/unit/layout/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/layout/matrix.cu b/test/unit/layout/matrix.cu index e463f0974e..2f686ca209 100644 --- a/test/unit/layout/matrix.cu +++ b/test/unit/layout/matrix.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/tensor.cu b/test/unit/layout/tensor.cu index b4a43fb3a9..68e1dfc16f 100644 --- a/test/unit/layout/tensor.cu +++ b/test/unit/layout/tensor.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/tensor_nhwc.cu b/test/unit/layout/tensor_nhwc.cu index 46482b2b2f..34300f8c68 100644 --- a/test/unit/layout/tensor_nhwc.cu +++ b/test/unit/layout/tensor_nhwc.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/CMakeLists.txt b/test/unit/nvrtc/CMakeLists.txt index 668ea35ebe..86aa42eeca 100644 --- a/test/unit/nvrtc/CMakeLists.txt +++ b/test/unit/nvrtc/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/nvrtc/cutlass/nvrtc/environment.h b/test/unit/nvrtc/cutlass/nvrtc/environment.h index 27e999348c..fd8bae1f82 100644 --- a/test/unit/nvrtc/cutlass/nvrtc/environment.h +++ b/test/unit/nvrtc/cutlass/nvrtc/environment.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/kernel/thread/testbed_kernel.h b/test/unit/nvrtc/kernel/thread/testbed_kernel.h index 500870581d..55edcc5518 100644 --- a/test/unit/nvrtc/kernel/thread/testbed_kernel.h +++ b/test/unit/nvrtc/kernel/thread/testbed_kernel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/stdlib/stdint.h b/test/unit/nvrtc/stdlib/stdint.h index 380216811b..7ceda345a0 100644 --- a/test/unit/nvrtc/stdlib/stdint.h +++ b/test/unit/nvrtc/stdlib/stdint.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/CMakeLists.txt b/test/unit/nvrtc/thread/CMakeLists.txt index 2e12ccfa8c..cb9b189635 100644 --- a/test/unit/nvrtc/thread/CMakeLists.txt +++ b/test/unit/nvrtc/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/gemm_nvrtc.cu b/test/unit/nvrtc/thread/gemm_nvrtc.cu index 785ebcb2ce..b799e6c9be 100644 --- a/test/unit/nvrtc/thread/gemm_nvrtc.cu +++ b/test/unit/nvrtc/thread/gemm_nvrtc.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/testbed.h b/test/unit/nvrtc/thread/testbed.h index 41ba503ad5..1062d7a21c 100644 --- a/test/unit/nvrtc/thread/testbed.h +++ b/test/unit/nvrtc/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/CMakeLists.txt b/test/unit/reduction/CMakeLists.txt index 96c3716141..d53bc0c1d9 100644 --- a/test/unit/reduction/CMakeLists.txt +++ b/test/unit/reduction/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/CMakeLists.txt b/test/unit/reduction/kernel/CMakeLists.txt index e1983153d1..89bb511a47 100644 --- a/test/unit/reduction/kernel/CMakeLists.txt +++ b/test/unit/reduction/kernel/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/reduce_splitk.cu b/test/unit/reduction/kernel/reduce_splitk.cu index b169cb60f1..6a27736f96 100644 --- a/test/unit/reduction/kernel/reduce_splitk.cu +++ b/test/unit/reduction/kernel/reduce_splitk.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/reduce_splitk_testbed.h b/test/unit/reduction/kernel/reduce_splitk_testbed.h index 8e70407063..4e6274bec0 100644 --- a/test/unit/reduction/kernel/reduce_splitk_testbed.h +++ b/test/unit/reduction/kernel/reduce_splitk_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/thread/CMakeLists.txt b/test/unit/reduction/thread/CMakeLists.txt index 0641590e8c..29de471363 100644 --- a/test/unit/reduction/thread/CMakeLists.txt +++ b/test/unit/reduction/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/thread/reduction_thread.cu b/test/unit/reduction/thread/reduction_thread.cu index f71e30f53c..b2cf8045c3 100644 --- a/test/unit/reduction/thread/reduction_thread.cu +++ b/test/unit/reduction/thread/reduction_thread.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/thread/testbed.h b/test/unit/reduction/thread/testbed.h index 919839b3d6..5873d9e6a1 100644 --- a/test/unit/reduction/thread/testbed.h +++ b/test/unit/reduction/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/test_unit.cpp b/test/unit/test_unit.cpp index 3bb8ac1387..51e9269541 100644 --- a/test/unit/test_unit.cpp +++ b/test/unit/test_unit.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/transform/CMakeLists.txt b/test/unit/transform/CMakeLists.txt index a7b881ae20..d7f800f472 100644 --- a/test/unit/transform/CMakeLists.txt +++ b/test/unit/transform/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/CMakeLists.txt b/test/unit/transform/threadblock/CMakeLists.txt index 0d5e5c44a0..65d31daca1 100644 --- a/test/unit/transform/threadblock/CMakeLists.txt +++ b/test/unit/transform/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/predicated_tile_iterator.cu b/test/unit/transform/threadblock/predicated_tile_iterator.cu index 562c7888a2..be8084e20f 100644 --- a/test/unit/transform/threadblock/predicated_tile_iterator.cu +++ b/test/unit/transform/threadblock/predicated_tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu index 8d2382e4cf..4183ed0f8e 100644 --- a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu +++ b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt index 7f103cbf3c..9f583b821b 100644 --- a/test/unit/util/CMakeLists.txt +++ b/test/unit/util/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. 
All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/util/tensor_reduce.cu b/test/unit/util/tensor_reduce.cu index 5a1afc7f39..d29022b16f 100644 --- a/test/unit/util/tensor_reduce.cu +++ b/test/unit/util/tensor_reduce.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index e43c821e64..753471bf3e 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt index 4bf7577fb8..5b3cec087e 100644 --- a/tools/library/CMakeLists.txt +++ b/tools/library/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/library/include/cutlass/library/handle.h b/tools/library/include/cutlass/library/handle.h index 27d2bfe6a4..fe5ac8191e 100644 --- a/tools/library/include/cutlass/library/handle.h +++ b/tools/library/include/cutlass/library/handle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h index 6a018a704c..18bfce2454 100644 --- a/tools/library/include/cutlass/library/library.h +++ b/tools/library/include/cutlass/library/library.h @@ -571,7 +571,6 @@ struct ConvDescription : public OperationDescription { }; - ///////////////////////////////////////////////////////////////////////////////////////////////// /// Base class for all operations @@ -933,49 +932,14 @@ struct Conv2dConfiguration { // also includes (split_k_slices, groups) conv::Conv2dProblemSize problem_size; - /// Layout object for activations tensor - layout::TensorNHWC layout_activations; + // stride of operand A + std::vector stride_a; - /// Layout object for filters tensor - layout::TensorNHWC layout_filters; - - /// Layout object for source tensor - layout::TensorNHWC layout_source; - - /// Layout object for output tensor - layout::TensorNHWC layout_output; - - // - // Methods - // + // stride of operand B + std::vector stride_b; - // Mapping functions (A,B,C -> activation,filter,output) - layout::TensorNHWC layout_a(library::ConvKind const &conv_kind) const { - switch (conv_kind) { - case library::ConvKind::kFprop: return layout_activations; - case library::ConvKind::kDgrad: return layout_output; - case library::ConvKind::kWgrad: return layout_output; - default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); - } - } - - layout::TensorNHWC layout_b(library::ConvKind const &conv_kind) const { - switch (conv_kind) { - case library::ConvKind::kFprop: return layout_filters; - case library::ConvKind::kDgrad: return layout_filters; - case library::ConvKind::kWgrad: return layout_activations; - default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); - } - } - - layout::TensorNHWC layout_c(library::ConvKind const &conv_kind) const { - switch (conv_kind) { - case library::ConvKind::kFprop: return layout_output; - case library::ConvKind::kDgrad: return layout_activations; - case library::ConvKind::kWgrad: return layout_filters; - default : throw std::runtime_error("Invalid Conv Operator (fprop, dgrad, wgrad)"); - } - } + // stride of operand C + std::vector stride_c; }; diff --git a/tools/library/include/cutlass/library/manifest.h b/tools/library/include/cutlass/library/manifest.h index 2bde2884b4..99e6b79248 100644 --- a/tools/library/include/cutlass/library/manifest.h +++ b/tools/library/include/cutlass/library/manifest.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
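In library.h, Conv2dConfiguration drops its four TensorNHWC layout members and the layout_a/layout_b/layout_c mapping helpers in favor of plain stride vectors for operands A, B, and C, which callers now fill directly. A minimal sketch of what filling them looks like for a packed NHWC Fprop problem, using the strides the old profiler code computed; the stride_a/stride_b/stride_c field names come from this diff, the element type (elided in this excerpt) is assumed to be int64_t, and the struct and function names are illustrative.

```cpp
#include <cstdint>
#include <vector>

// Illustrative stand-in for the fields added to library::Conv2dConfiguration;
// the real struct also carries split_k_mode and the Conv2dProblemSize.
struct Conv2dConfigurationStrides {
  std::vector<int64_t> stride_a;   // activations for Fprop
  std::vector<int64_t> stride_b;   // filters for Fprop
  std::vector<int64_t> stride_c;   // output for Fprop
};

// Packed NHWC strides for a Fprop problem with activations NxHxWxC, K filters
// of extent RxSxC, and output NxPxQxK, matching the strides previously stored
// in layout_activations / layout_filters / layout_output.
Conv2dConfigurationStrides make_fprop_strides(
    int h, int w, int c, int r, int s, int k, int p, int q) {

  Conv2dConfigurationStrides config;
  config.stride_a = {c, w * c, h * w * c};   // activations
  config.stride_b = {c, s * c, r * s * c};   // filters
  config.stride_c = {k, q * k, p * q * k};   // output
  return config;
}
```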
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py index 491997cb89..681fb82837 100644 --- a/tools/library/scripts/generator.py +++ b/tools/library/scripts/generator.py @@ -929,10 +929,10 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) -# conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) -# -# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, -# data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 8 @@ -1069,10 +1069,10 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) -# conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) -# -# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, -# data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 16 @@ -1644,10 +1644,10 @@ def GenerateSM80_TensorOp_16832_Interleaved(manifest, args): operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) -# conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) -# -# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, -# data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32, LayoutType.TensorNC32HW32) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 16, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 8 @@ -1825,10 +1825,10 @@ def GenerateSM80_TensorOp_16864_Interleaved(manifest, args): operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) -# conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) -# -# operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, -# data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) + conv_layout = (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64, LayoutType.TensorNC64HW64) + + operations += CreateConv2dOperator(manifest, conv_layout, tile_descriptions, + data_type_mixed, 32, [ConvKind.Fprop], EpilogueFunctor.LinearCombinationClamp) for op in 
operations: op.C.alignment = 16 @@ -2096,7 +2096,6 @@ def GenerateSM80_TensorOp_1688_complex(manifest, args): max_cc = 1024 tile_descriptions = [ - TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc), TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc), TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), diff --git a/tools/library/scripts/library.py b/tools/library/scripts/library.py index b9538cdbc5..5df09a8970 100644 --- a/tools/library/scripts/library.py +++ b/tools/library/scripts/library.py @@ -187,7 +187,6 @@ class DataType(enum.Enum): } ################################################################################################### - # class ComplexTransform(enum.Enum): none = enum_auto() @@ -312,7 +311,7 @@ class LayoutType(enum.Enum): # ShortLayoutTypeNames = { LayoutType.ColumnMajor: 'n', - LayoutType.ColumnMajorInterleaved32: 'n2', + LayoutType.ColumnMajorInterleaved2: 'n2', LayoutType.ColumnMajorInterleaved32: 'n32', LayoutType.ColumnMajorInterleaved64: 'n64', LayoutType.RowMajor: 't', @@ -343,6 +342,8 @@ class OpcodeClass(enum.Enum): Simt = enum_auto() TensorOp = enum_auto() WmmaTensorOp = enum_auto() + SparseTensorOp = enum_auto() + OpcodeClassNames = { OpcodeClass.Simt: 'simt', diff --git a/tools/library/src/conv2d_operation.h b/tools/library/src/conv2d_operation.h index 5e8f887fd1..9cc332498e 100644 --- a/tools/library/src/conv2d_operation.h +++ b/tools/library/src/conv2d_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/conv3d_operation.h b/tools/library/src/conv3d_operation.h index 32ad036320..6f110a46e1 100644 --- a/tools/library/src/conv3d_operation.h +++ b/tools/library/src/conv3d_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/gemm_operation.h b/tools/library/src/gemm_operation.h index d65e3414d5..5dd2ed2935 100644 --- a/tools/library/src/gemm_operation.h +++ b/tools/library/src/gemm_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu index 3f19def654..6108bdc759 100644 --- a/tools/library/src/handle.cu +++ b/tools/library/src/handle.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h index 4bbd21c763..218e1a3f32 100644 --- a/tools/library/src/library_internal.h +++ b/tools/library/src/library_internal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp index 12358dcdd3..bbfc3411f9 100644 --- a/tools/library/src/manifest.cpp +++ b/tools/library/src/manifest.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reduction/init_reduction_operations.cu b/tools/library/src/reduction/init_reduction_operations.cu index 5f86b64f78..41788f5d72 100644 --- a/tools/library/src/reduction/init_reduction_operations.cu +++ b/tools/library/src/reduction/init_reduction_operations.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reduction/reduction_device.cu b/tools/library/src/reduction/reduction_device.cu index e2133cc0a5..c07ba01455 100644 --- a/tools/library/src/reduction/reduction_device.cu +++ b/tools/library/src/reduction/reduction_device.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/conv2d.cu b/tools/library/src/reference/conv2d.cu index 750ebdf31c..f115384dcf 100644 --- a/tools/library/src/reference/conv2d.cu +++ b/tools/library/src/reference/conv2d.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/conv3d.cu b/tools/library/src/reference/conv3d.cu index 1e1544bff6..29dc880a05 100644 --- a/tools/library/src/reference/conv3d.cu +++ b/tools/library/src/reference/conv3d.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/conv_reference_operation.h b/tools/library/src/reference/conv_reference_operation.h index 1e826ab29e..811621c125 100644 --- a/tools/library/src/reference/conv_reference_operation.h +++ b/tools/library/src/reference/conv_reference_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -109,7 +109,19 @@ struct ConvReferenceDispatcher< Conv2dConfiguration const &config = *static_cast(configuration); - ConvKind const conv_kind = ConvKindMap::kId; + // TODO: make below code more general. It is fixed for NHWC now. + layout::TensorNHWC layout_a; + layout::TensorNHWC layout_b; + layout::TensorNHWC layout_c; + + layout_a.stride() = + make_Coord(config.stride_a[0], config.stride_a[1], config.stride_a[2]); + + layout_b.stride() = + make_Coord(config.stride_b[0], config.stride_b[1], config.stride_b[2]); + + layout_c.stride() = + make_Coord(config.stride_c[0], config.stride_c[1], config.stride_c[2]); if (kProvider == Provider::kReferenceHost) { @@ -127,10 +139,10 @@ struct ConvReferenceDispatcher< >( kConvolutionalOperator, config.problem_size, - {ptr_A, config.layout_a(conv_kind)}, - {ptr_B, config.layout_b(conv_kind)}, - {ptr_C, config.layout_c(conv_kind)}, - {ptr_D, config.layout_c(conv_kind)}, + {ptr_A, layout_a}, + {ptr_B, layout_b}, + {ptr_C, layout_c}, + {ptr_D, layout_c}, alpha, beta ); @@ -152,10 +164,10 @@ struct ConvReferenceDispatcher< >( kConvolutionalOperator, config.problem_size, - {ptr_A, config.layout_a(conv_kind)}, - {ptr_B, config.layout_b(conv_kind)}, - {ptr_C, config.layout_c(conv_kind)}, - {ptr_D, config.layout_c(conv_kind)}, + {ptr_A, layout_a}, + {ptr_B, layout_b}, + {ptr_C, layout_c}, + {ptr_D, layout_c}, alpha, beta, stream diff --git a/tools/library/src/reference/gemm.cu b/tools/library/src/reference/gemm.cu index 8e5361fd20..c95f3b5444 100644 --- a/tools/library/src/reference/gemm.cu +++ b/tools/library/src/reference/gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
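With the layout objects removed from Conv2dConfiguration, the reference dispatcher above reconstructs TensorNHWC layouts from the stride vectors before invoking the host or device reference kernels; the in-code TODO notes this is hard-wired to NHWC for now. A small sketch of that reconstruction, assuming three-element stride vectors as produced by the profiler; the helper name is illustrative.

```cpp
#include <cstdint>
#include <vector>
#include "cutlass/coord.h"
#include "cutlass/layout/tensor.h"

// Build a TensorNHWC layout object from the flat stride vector stored in
// Conv2dConfiguration (innermost stride first, as in the diff above).
cutlass::layout::TensorNHWC make_nhwc_layout(std::vector<int64_t> const &stride) {
  cutlass::layout::TensorNHWC layout;
  layout.stride() = cutlass::make_Coord(
      int(stride[0]), int(stride[1]), int(stride[2]));
  return layout;
}
```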
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/gemm_reference_operation.h b/tools/library/src/reference/gemm_reference_operation.h index 11a5230bbe..b331bb5870 100644 --- a/tools/library/src/reference/gemm_reference_operation.h +++ b/tools/library/src/reference/gemm_reference_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/library/src/reference/initialize_reference_operations.cu b/tools/library/src/reference/initialize_reference_operations.cu index c749c2bca9..624506d704 100644 --- a/tools/library/src/reference/initialize_reference_operations.cu +++ b/tools/library/src/reference/initialize_reference_operations.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt index 3ac944a9f2..bb3975c4d0 100644 --- a/tools/profiler/CMakeLists.txt +++ b/tools/profiler/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -87,6 +87,7 @@ install( set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM --operation=Gemm --providers=cutlass --verification-providers=cublas,device --junit-output=test_cutlass_profiler_gemm) set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D --operation=Conv2d --providers=cutlass --verification-providers=cudnn,device --junit-output=test_cutlass_profiler_conv2d) set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D --operation=Conv3d --providers=cutlass --verification-providers=cudnn,device,host --junit-output=test_cutlass_profiler_conv3d) +set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM --operation=SparseGemm --providers=cutlass --verification-providers=cublas,device,host --junit-output=test_cutlass_profiler_spgemm) cutlass_add_executable_tests( test_profiler cutlass_profiler DEPENDEES test_all @@ -94,5 +95,6 @@ cutlass_add_executable_tests( CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM DISABLE_EXECUTABLE_INSTALL_RULE ) diff --git a/tools/profiler/src/conv2d_operation_profiler.cu b/tools/profiler/src/conv2d_operation_profiler.cu index 4b91535719..2246e9610e 100644 --- a/tools/profiler/src/conv2d_operation_profiler.cu +++ b/tools/profiler/src/conv2d_operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -381,24 +381,9 @@ Status Conv2dOperationProfiler::initialize_configuration(
   conv_workspace_.configuration.split_k_mode = static_cast(static_cast(problem_.split_k_mode));
 
-  conv_workspace_.configuration.layout_activations.stride() = make_Coord(
-    int(problem_.c),
-    int(problem_.w) * int(problem_.c),
-    int(problem_.h) * int(problem_.w) * int(problem_.c)
-  );
-
-  conv_workspace_.configuration.layout_filters.stride() = make_Coord(
-    int(problem_.c),
-    int(problem_.s) * int(problem_.c),
-    int(problem_.r) * int(problem_.s) * int(problem_.c)
-  );
-
-  conv_workspace_.configuration.layout_output.stride() = make_Coord(
-    int(problem_.k),
-    int(problem_.q) * int(problem_.k),
-    int(problem_.q) * int(problem_.p) * int(problem_.k)
-  );
-
+  conv_workspace_.set_stride_vector(
+      problem_, operation_desc.conv_kind, operation_desc.A.layout,
+      operation_desc.B.layout, operation_desc.C.layout);
 
   // initialize library::ConvArguments
   conv_workspace_.arguments.A = nullptr;
@@ -540,9 +525,12 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
   conv_workspace_.reduction_configuration.problem_size = problem_.eq_gemm_size(conv_kind).mn();
   conv_workspace_.reduction_configuration.partitions = int(problem_.split_k_slices);
   conv_workspace_.reduction_configuration.partition_stride = problem_.eq_gemm_size(conv_kind).mn().product();
-  conv_workspace_.reduction_configuration.ldw = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
-  conv_workspace_.reduction_configuration.lds = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
-  conv_workspace_.reduction_configuration.ldd = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.ldw =
+      conv_workspace_.configuration.stride_c[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.lds =
+      conv_workspace_.configuration.stride_c[tensor_c_stride_idx];
+  conv_workspace_.reduction_configuration.ldd =
+      conv_workspace_.configuration.stride_c[tensor_c_stride_idx];
 
   // find reduction operation
   library::ReductionFunctionalKey reduction_key(
@@ -616,7 +604,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.A.element,
       operation_desc.A.layout,
       problem_.extent_a(operation_desc.conv_kind),
-      conv_workspace_.stride_a(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_a,
       conv_workspace_.problem_count
     );
@@ -626,7 +614,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.B.element,
       operation_desc.B.layout,
       problem_.extent_b(operation_desc.conv_kind),
-      conv_workspace_.stride_b(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_b,
      conv_workspace_.problem_count
     );
@@ -636,7 +624,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.C.element,
       operation_desc.C.layout,
       problem_.extent_c(operation_desc.conv_kind),
-      conv_workspace_.stride_c(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_c,
       conv_workspace_.problem_count
     );
@@ -645,7 +633,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.C.element,
       operation_desc.C.layout,
       problem_.extent_c(operation_desc.conv_kind),
-      conv_workspace_.stride_c(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_c,
       conv_workspace_.problem_count
     );
@@ -654,10 +642,9 @@ Status Conv2dOperationProfiler::initialize_workspace(
       operation_desc.C.element,
       operation_desc.C.layout,
       problem_.extent_c(operation_desc.conv_kind),
-      conv_workspace_.stride_c(operation_desc.conv_kind),
+      conv_workspace_.configuration.stride_c,
       conv_workspace_.problem_count
     );
-
   }
 
   //
diff --git a/tools/profiler/src/conv2d_operation_profiler.h b/tools/profiler/src/conv2d_operation_profiler.h
index 40c003e1d4..2f99b67ce4 100644
--- a/tools/profiler/src/conv2d_operation_profiler.h
+++ b/tools/profiler/src/conv2d_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -257,42 +257,95 @@ class Conv2dOperationProfiler : public OperationProfiler {
     /// host buffer for tensor c
     std::vector host_tensor_c;
 
-
     //
     // Methods
     //
-    Conv2dWorkspace():
-      A(nullptr), B(nullptr), C(nullptr), Computed(nullptr), Reference(nullptr) { }
-
-    // Returns stride vector for tensor A
-    std::vector stride_a(library::ConvKind const &conv_kind) {
-      return {
-        configuration.layout_a(conv_kind).stride()[0],
-        configuration.layout_a(conv_kind).stride()[1],
-        configuration.layout_a(conv_kind).stride()[2]
-      };
-    }
-
-    // Returns stride vector for tensor B
-    std::vector stride_b(library::ConvKind const &conv_kind) {
-
-      return {
-        configuration.layout_b(conv_kind).stride()[0],
-        configuration.layout_b(conv_kind).stride()[1],
-        configuration.layout_b(conv_kind).stride()[2]
-      };
+    Conv2dWorkspace()
+        : A(nullptr),
+          B(nullptr),
+          C(nullptr),
+          Computed(nullptr),
+          Reference(nullptr) {}
+
+    // Set stride vector for tensor activations, filters, output
+    void set_stride_vector(Conv2dProblem const &problem,
+                           library::ConvKind const &conv_kind,
+                           library::LayoutTypeID const &layout_a,
+                           library::LayoutTypeID const &layout_b,
+                           library::LayoutTypeID const &layout_c) {
+      std::vector stride_activations;
+      std::vector stride_filters;
+      std::vector stride_output;
+
+      // Strides for interleaved fprop
+      if (conv_kind == library::ConvKind::kFprop &&
+          ((layout_a == library::LayoutTypeID::kTensorNC32HW32 &&
+            layout_b == library::LayoutTypeID::kTensorC32RSK32 &&
+            layout_c == library::LayoutTypeID::kTensorNC32HW32) ||
+           (layout_a == library::LayoutTypeID::kTensorNC64HW64 &&
+            layout_b == library::LayoutTypeID::kTensorC64RSK64 &&
+            layout_c == library::LayoutTypeID::kTensorNC64HW64))) {
+        int interleave =
+            (layout_a == library::LayoutTypeID::kTensorNC32HW32) ? 32 : 64;
+
+        stride_activations.push_back(int(problem.w) * interleave);
+        stride_activations.push_back(int(problem.w) * int(problem.h) *
+                                     interleave);
+        stride_activations.push_back(int(problem.h) * int(problem.w) *
+                                     int(problem.c));
+
+        stride_filters.push_back(int(problem.k) * interleave);
+        stride_filters.push_back(int(problem.k) * int(problem.s) * interleave);
+        stride_filters.push_back(int(problem.k) * int(problem.s) *
+                                 int(problem.r) * interleave);
+
+        stride_output.push_back(int(problem.q) * interleave);
+        stride_output.push_back(int(problem.q) * int(problem.p) * interleave);
+        stride_output.push_back(int(problem.q) * int(problem.p) *
+                                int(problem.k));
+      } else {
+        // Strides for the rest cases
+        stride_activations.push_back(int(problem.c));
+        stride_activations.push_back(int(problem.w) * int(problem.c));
+        stride_activations.push_back(int(problem.h) * int(problem.w) *
+                                     int(problem.c));
+
+        stride_filters.push_back(int(problem.c));
+        stride_filters.push_back(int(problem.s) * int(problem.c));
+        stride_filters.push_back(int(problem.r) * int(problem.s) *
+                                 int(problem.c));
+
+        stride_output.push_back(int(problem.k));
+        stride_output.push_back(int(problem.q) * int(problem.k));
+        stride_output.push_back(int(problem.q) * int(problem.p) *
+                                int(problem.k));
       }
-
-    // Returns stride vector for tensor C
-    std::vector stride_c(library::ConvKind const &conv_kind) {
-
-      return {
-        configuration.layout_c(conv_kind).stride()[0],
-        configuration.layout_c(conv_kind).stride()[1],
-        configuration.layout_c(conv_kind).stride()[2]
-      };
+      switch (conv_kind) {
+        case library::ConvKind::kFprop:
+          configuration.stride_a = stride_activations;
+          configuration.stride_b = stride_filters;
+          configuration.stride_c = stride_output;
+
+          break;
+        case library::ConvKind::kDgrad:
+          configuration.stride_a = stride_output;
+          configuration.stride_b = stride_filters;
+          configuration.stride_c = stride_activations;
+
+          break;
+        case library::ConvKind::kWgrad:
+          configuration.stride_a = stride_output;
+          configuration.stride_b = stride_activations;
+          configuration.stride_c = stride_filters;
+
+          break;
+        default:
+          throw std::runtime_error(
+              "Invalid Conv Operator (fprop, dgrad, wgrad)");
      }
+    }
   };
 
  protected:
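Note on the hunk above: the three per-tensor stride accessors are replaced by a single set_stride_vector() that computes packed strides once and then assigns them to operands A, B, and C according to the convolution kind. The sketch below restates that mapping for the common non-interleaved NHWC path; it is illustrative only, and ProblemExtents, ConvKind, and the helper names are stand-ins rather than CUTLASS types.

```
// Minimal sketch (not part of the patch): packed NHWC/KRSC/NPQK strides and
// how the operand roles rotate with the convolution kind.
#include <cstdint>
#include <vector>

struct ProblemExtents { int n, h, w, c, k, r, s, p, q; };

struct OperandStrides {
  std::vector<int64_t> stride_a, stride_b, stride_c;
};

// Packed NHWC activations: innermost stride C, then W*C, then H*W*C.
inline std::vector<int64_t> nhwc_strides(ProblemExtents const &x) {
  return { x.c, int64_t(x.w) * x.c, int64_t(x.h) * x.w * x.c };
}

// Packed KRSC filters: C, then S*C, then R*S*C.
inline std::vector<int64_t> krsc_strides(ProblemExtents const &x) {
  return { x.c, int64_t(x.s) * x.c, int64_t(x.r) * x.s * x.c };
}

// Packed NPQK output: K, then Q*K, then Q*P*K.
inline std::vector<int64_t> npqk_strides(ProblemExtents const &x) {
  return { x.k, int64_t(x.q) * x.k, int64_t(x.q) * x.p * x.k };
}

enum class ConvKind { kFprop, kDgrad, kWgrad };

inline OperandStrides assign_strides(ProblemExtents const &x, ConvKind kind) {
  auto act = nhwc_strides(x);
  auto flt = krsc_strides(x);
  auto out = npqk_strides(x);
  switch (kind) {
    case ConvKind::kFprop: return {act, flt, out};  // A=activations, B=filters, C=output
    case ConvKind::kDgrad: return {out, flt, act};  // A=output grad, B=filters, C=activation grad
    default:               return {out, act, flt};  // kWgrad: A=output grad, B=activations, C=filter grad
  }
}
```

The interleaved NC32HW32/NC64HW64 fprop branch in the patch follows the same pattern, with the inner strides scaled by the interleave factor.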
diff --git a/tools/profiler/src/conv3d_operation_profiler.cu b/tools/profiler/src/conv3d_operation_profiler.cu
index 67f21d8f7a..6e45759abf 100644
--- a/tools/profiler/src/conv3d_operation_profiler.cu
+++ b/tools/profiler/src/conv3d_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/conv3d_operation_profiler.h b/tools/profiler/src/conv3d_operation_profiler.h
index 04c2a15e82..2192a984c6 100644
--- a/tools/profiler/src/conv3d_operation_profiler.h
+++ b/tools/profiler/src/conv3d_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cublas_helpers.cpp b/tools/profiler/src/cublas_helpers.cpp
index 3369d9615a..30db20e22c 100644
--- a/tools/profiler/src/cublas_helpers.cpp
+++ b/tools/profiler/src/cublas_helpers.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cublas_helpers.h b/tools/profiler/src/cublas_helpers.h
index c2bf13b5f7..ec1bf0dbba 100644
--- a/tools/profiler/src/cublas_helpers.h
+++ b/tools/profiler/src/cublas_helpers.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cudnn_helpers.cpp b/tools/profiler/src/cudnn_helpers.cpp
index 86f18095bf..838a41a055 100644
--- a/tools/profiler/src/cudnn_helpers.cpp
+++ b/tools/profiler/src/cudnn_helpers.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cudnn_helpers.h b/tools/profiler/src/cudnn_helpers.h
index 58fe4e678f..c93fbc93e8 100644
--- a/tools/profiler/src/cudnn_helpers.h
+++ b/tools/profiler/src/cudnn_helpers.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu
index c1e33ad61e..c53e8c221f 100644
--- a/tools/profiler/src/cutlass_profiler.cu
+++ b/tools/profiler/src/cutlass_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -167,6 +167,7 @@ void CutlassProfiler::print_usage_(std::ostream &out) {
     << " $ cutlass_profiler --operation=Gemm --help\n\n"
     << " $ cutlass_profiler --operation=Conv3d --help\n\n"
     << " $ cutlass_profiler --operation=Conv2d --help\n\n"
+    << " $ cutlass_profiler --operation=SparseGemm --help\n\n"
     ;
 }
diff --git a/tools/profiler/src/cutlass_profiler.h b/tools/profiler/src/cutlass_profiler.h
index d3b592a4ea..8bd44a893d 100644
--- a/tools/profiler/src/cutlass_profiler.h
+++ b/tools/profiler/src/cutlass_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/debug.h b/tools/profiler/src/debug.h
index aed11ca188..7bf5b8e761 100644
--- a/tools/profiler/src/debug.h
+++ b/tools/profiler/src/debug.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu
index 247bcccf15..38a4acbe59 100644
--- a/tools/profiler/src/device_allocation.cu
+++ b/tools/profiler/src/device_allocation.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/device_allocation.h b/tools/profiler/src/device_allocation.h
index b7bb5ec729..0aa9d0ecd1 100644
--- a/tools/profiler/src/device_allocation.h
+++ b/tools/profiler/src/device_allocation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/device_context.cu b/tools/profiler/src/device_context.cu
index a8bd4fa218..3ab6b4c796 100644
--- a/tools/profiler/src/device_context.cu
+++ b/tools/profiler/src/device_context.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -82,6 +82,9 @@ DeviceAllocation *DeviceContext::allocate_tensor(
   if(!options.initialization.fix_data_distribution) {
     // change data distribution based on bit width
     switch(type) {
+      case library::NumericTypeID::kF16:
+        data_distribution.set_uniform(-3, 3, 0);
+        break;
       case library::NumericTypeID::kB1:
         data_distribution.set_uniform(0, 1, 0);
         break;
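The new kF16 case above seeds half-precision tensors from a narrow, integer-valued uniform distribution. A plausible reason is exact verification: small integers round-trip through fp16 without rounding, so device results can be compared against a reference computed in wider precision without tolerance slack. A minimal illustration, not part of the patch, assuming only cutlass::half_t from cutlass/numeric_types.h:

```
#include <cassert>
#include "cutlass/numeric_types.h"

int main() {
  // Mirrors set_uniform(-3, 3, 0) above: integer values of small magnitude.
  for (int i = -3; i <= 3; ++i) {
    cutlass::half_t h(float(i));
    // The round trip through half precision is exact for these values.
    assert(float(h) == float(i));
  }
  return 0;
}
```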
diff --git a/tools/profiler/src/device_context.h b/tools/profiler/src/device_context.h
index 1633a2dd29..5e74f07e20 100644
--- a/tools/profiler/src/device_context.h
+++ b/tools/profiler/src/device_context.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/enumerated_types.cpp b/tools/profiler/src/enumerated_types.cpp
index 29be6f8baf..0b7b21ba04 100644
--- a/tools/profiler/src/enumerated_types.cpp
+++ b/tools/profiler/src/enumerated_types.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/enumerated_types.h b/tools/profiler/src/enumerated_types.h
index e7e713bdbf..6b8429c49f 100644
--- a/tools/profiler/src/enumerated_types.h
+++ b/tools/profiler/src/enumerated_types.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/gemm_operation_profiler.cu b/tools/profiler/src/gemm_operation_profiler.cu
index cf7f8ff64c..63bbc32a99 100644
--- a/tools/profiler/src/gemm_operation_profiler.cu
+++ b/tools/profiler/src/gemm_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -659,7 +659,7 @@ bool GemmOperationProfiler::verify_with_cublas_(
     gemm_workspace_.arguments.B = gemm_workspace_.B->data();
     gemm_workspace_.arguments.batch_stride_B = gemm_workspace_.B->batch_stride();
     gemm_workspace_.arguments.C = gemm_workspace_.Reference->data();
-    gemm_workspace_.arguments.batch_stride_D = gemm_workspace_.Reference->batch_stride();
+    gemm_workspace_.arguments.batch_stride_C = gemm_workspace_.Reference->batch_stride();
     gemm_workspace_.arguments.D = gemm_workspace_.Reference->data();
     gemm_workspace_.arguments.batch_stride_D = gemm_workspace_.Reference->batch_stride();
     gemm_workspace_.arguments.alpha = problem_.alpha.data();
diff --git a/tools/profiler/src/gemm_operation_profiler.h b/tools/profiler/src/gemm_operation_profiler.h
index 1c6c5e7ceb..1adc88968d 100644
--- a/tools/profiler/src/gemm_operation_profiler.h
+++ b/tools/profiler/src/gemm_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/gpu_timer.cpp b/tools/profiler/src/gpu_timer.cpp
index eb3a841150..a6297b025b 100644
--- a/tools/profiler/src/gpu_timer.cpp
+++ b/tools/profiler/src/gpu_timer.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/gpu_timer.h b/tools/profiler/src/gpu_timer.h
index 5cd4b0037f..79d8760c2f 100644
--- a/tools/profiler/src/gpu_timer.h
+++ b/tools/profiler/src/gpu_timer.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/profiler/src/main.cpp b/tools/profiler/src/main.cpp
index a1e523111d..4f76a1119d 100644
--- a/tools/profiler/src/main.cpp
+++ b/tools/profiler/src/main.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu index edd6f07ce2..f50b77e1b6 100644 --- a/tools/profiler/src/operation_profiler.cu +++ b/tools/profiler/src/operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/operation_profiler.h b/tools/profiler/src/operation_profiler.h index 731554b6f2..c47741290b 100644 --- a/tools/profiler/src/operation_profiler.h +++ b/tools/profiler/src/operation_profiler.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu index 6bac578072..eeb7814703 100644 --- a/tools/profiler/src/options.cu +++ b/tools/profiler/src/options.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h index 79e0169970..69d93ae257 100644 --- a/tools/profiler/src/options.h +++ b/tools/profiler/src/options.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_report.cpp b/tools/profiler/src/performance_report.cpp index de184eb04b..afe2debe0e 100644 --- a/tools/profiler/src/performance_report.cpp +++ b/tools/profiler/src/performance_report.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_report.h b/tools/profiler/src/performance_report.h index 5005103158..a2fe5baa3a 100644 --- a/tools/profiler/src/performance_report.h +++ b/tools/profiler/src/performance_report.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_result.cu b/tools/profiler/src/performance_result.cu index 86cabfb753..1a01aa2b06 100644 --- a/tools/profiler/src/performance_result.cu +++ b/tools/profiler/src/performance_result.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_result.h b/tools/profiler/src/performance_result.h index 9e3ebeb5ce..e5dc6a5c95 100644 --- a/tools/profiler/src/performance_result.h +++ b/tools/profiler/src/performance_result.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/problem_space.cpp b/tools/profiler/src/problem_space.cpp index a8c4943218..910764e55d 100644 --- a/tools/profiler/src/problem_space.cpp +++ b/tools/profiler/src/problem_space.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/problem_space.h b/tools/profiler/src/problem_space.h index 8e10dbafce..cf4e766234 100644 --- a/tools/profiler/src/problem_space.h +++ b/tools/profiler/src/problem_space.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/sparse_gemm_operation_profiler.cu b/tools/profiler/src/sparse_gemm_operation_profiler.cu index 7eff2062b0..aa960ec3df 100644 --- a/tools/profiler/src/sparse_gemm_operation_profiler.cu +++ b/tools/profiler/src/sparse_gemm_operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/sparse_gemm_operation_profiler.h b/tools/profiler/src/sparse_gemm_operation_profiler.h index 37905d3b88..9ae62d24cb 100644 --- a/tools/profiler/src/sparse_gemm_operation_profiler.h +++ b/tools/profiler/src/sparse_gemm_operation_profiler.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/CMakeLists.txt b/tools/util/CMakeLists.txt index 0d2f86fb99..db4dc3d9b3 100644 --- a/tools/util/CMakeLists.txt +++ b/tools/util/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h index c158ef9768..31187a7969 100644 --- a/tools/util/include/cutlass/util/command_line.h +++ b/tools/util/include/cutlass/util/command_line.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/debug.h b/tools/util/include/cutlass/util/debug.h index 3ebbd4d843..e10e91459a 100644 --- a/tools/util/include/cutlass/util/debug.h +++ b/tools/util/include/cutlass/util/debug.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/device_dump.h b/tools/util/include/cutlass/util/device_dump.h index dac6029c41..1028d5d584 100644 --- a/tools/util/include/cutlass/util/device_dump.h +++ b/tools/util/include/cutlass/util/device_dump.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index 79b123687a..424a0e6f09 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/distribution.h b/tools/util/include/cutlass/util/distribution.h index 0337737747..8e4ea159c4 100644 --- a/tools/util/include/cutlass/util/distribution.h +++ b/tools/util/include/cutlass/util/distribution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/exceptions.h b/tools/util/include/cutlass/util/exceptions.h index 519205f6d2..d8d6ef94cc 100644 --- a/tools/util/include/cutlass/util/exceptions.h +++ b/tools/util/include/cutlass/util/exceptions.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/host_reorder.h b/tools/util/include/cutlass/util/host_reorder.h index 660ee0f956..1935e390c4 100644 --- a/tools/util/include/cutlass/util/host_reorder.h +++ b/tools/util/include/cutlass/util/host_reorder.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -62,15 +62,15 @@ void reorder_column(TensorRef dest,
   }
 }
 
-template
+template
 void reorder_convK(TensorRef dest,
                    TensorRef src,
                    cutlass::gemm::GemmCoord problem_size) {
 
-  TensorRef> mappedDest(dest.data(), dest.stride(0));
-  TensorRef> mappedSrc(src.data(), src.stride(0));
+  TensorRef> mappedDest(dest.data(), dest.stride(0));
+  TensorRef> mappedSrc(src.data(), src.stride(0));
 
-  reorder_column(
+  reorder_column(
       mappedDest, mappedSrc, problem_size);
 }
diff --git a/tools/util/include/cutlass/util/host_tensor.h b/tools/util/include/cutlass/util/host_tensor.h
index 465d74a93b..f105434fde 100644
--- a/tools/util/include/cutlass/util/host_tensor.h
+++ b/tools/util/include/cutlass/util/host_tensor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -238,7 +238,7 @@ class HostTensor {
   Element * host_data() { return host_.data(); }
 
   /// Gets pointer to host data with a pointer offset
-  Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return host_.data() + ptr_element_offset; }
+  Element * host_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(host_.data(), ptr_element_offset); }
 
   /// Gets a reference to an element in host memory
   Reference host_data(LongIndex idx) {
@@ -257,7 +257,7 @@ class HostTensor {
   Element * device_data() { return device_.get(); }
 
   /// Gets pointer to device data with a pointer offset
-  Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return device_.get() + ptr_element_offset; }
+  Element * device_data_ptr_offset(LongIndex ptr_element_offset) { return &ReferenceFactory::get(device_data(), ptr_element_offset); }
 
   /// Gets pointer to device data
   Element const * device_data() const { return device_.get(); }
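The HostTensor change above computes pointer offsets through ReferenceFactory rather than raw Element* arithmetic. A likely motivation is packed sub-byte element types (for example 4-bit integers), where one logical element is not one addressable unit, so data() + offset over-advances. The sketch below illustrates the pitfall with a hypothetical packed 4-bit buffer; names and layout are assumptions for illustration, not the CUTLASS implementation.

```
#include <cassert>
#include <cstddef>
#include <cstdint>

struct PackedInt4Buffer {
  uint8_t *storage;  // two 4-bit elements packed per byte

  // Correct mapping: element offset -> containing byte (the low/high nibble
  // would be selected separately by an offset-aware reference type).
  uint8_t *byte_for_element(std::ptrdiff_t element_offset) const {
    return storage + element_offset / 2;
  }
};

int main() {
  uint8_t bytes[8] = {};
  PackedInt4Buffer buf{bytes};

  // Element 6 lives in byte 3 of the packed storage, not at storage + 6,
  // so raw "pointer + element_offset" arithmetic points at the wrong byte.
  assert(buf.byte_for_element(6) == bytes + 3);
  assert((bytes + 6) != buf.byte_for_element(6));
  return 0;
}
```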
diff --git a/tools/util/include/cutlass/util/host_tensor_planar_complex.h b/tools/util/include/cutlass/util/host_tensor_planar_complex.h
index 6bdc8fe47b..50919a1759 100644
--- a/tools/util/include/cutlass/util/host_tensor_planar_complex.h
+++ b/tools/util/include/cutlass/util/host_tensor_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/host_uncompress.h b/tools/util/include/cutlass/util/host_uncompress.h
index 8b630030e5..7f5e8213ba 100644
--- a/tools/util/include/cutlass/util/host_uncompress.h
+++ b/tools/util/include/cutlass/util/host_uncompress.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/detail/inner_product.h b/tools/util/include/cutlass/util/reference/detail/inner_product.h
index f75f8b8884..7fdc2462e8 100644
--- a/tools/util/include/cutlass/util/reference/detail/inner_product.h
+++ b/tools/util/include/cutlass/util/reference/detail/inner_product.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h b/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
index db00e712ed..67e0e1aa10 100644
--- a/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
+++ b/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/device/convolution.h b/tools/util/include/cutlass/util/reference/device/convolution.h
index 843b6b15b9..9d814263e7 100644
--- a/tools/util/include/cutlass/util/reference/device/convolution.h
+++ b/tools/util/include/cutlass/util/reference/device/convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/device/gemm.h b/tools/util/include/cutlass/util/reference/device/gemm.h
index 3e4bfb31b6..93ecdda4e2 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
index 7c736603bb..7ad38a84db 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h index b9bdbfa026..0ff572a270 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h index 3b9688d17a..0e5c668ebb 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h index 4d9de5156e..67ddfdc4f0 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h index 8d813ea243..7524b740de 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h index eb61754e47..9aa0a4f923 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_fill.h b/tools/util/include/cutlass/util/reference/device/tensor_fill.h index ff2e5f3666..09ead0ef4d 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_fill.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_fill.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index 54621006e1..f9031c5cae 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_reduce.h b/tools/util/include/cutlass/util/reference/device/tensor_reduce.h index a268c92526..c8f279c066 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_reduce.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_reduce.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_relu.h b/tools/util/include/cutlass/util/reference/device/tensor_relu.h index d78e19533e..8717c921a5 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_relu.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_relu.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/thread/gemm.h b/tools/util/include/cutlass/util/reference/device/thread/gemm.h index 318e6c8368..880b1a12a9 100644 --- a/tools/util/include/cutlass/util/reference/device/thread/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/thread/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/convolution.h b/tools/util/include/cutlass/util/reference/host/convolution.h index 48f5db81ea..f69ba174bc 100644 --- a/tools/util/include/cutlass/util/reference/host/convolution.h +++ b/tools/util/include/cutlass/util/reference/host/convolution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm.h b/tools/util/include/cutlass/util/reference/host/gemm.h index 6381aa3066..628961e41f 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm.h +++ b/tools/util/include/cutlass/util/reference/host/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_complex.h index 473115ff87..a195ece7ae 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/host/gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h index 127c501bd3..6fe9d8e0ae 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h +++ b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_compare.h b/tools/util/include/cutlass/util/reference/host/tensor_compare.h index 2d7545e907..faa1177590 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_compare.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_copy.h b/tools/util/include/cutlass/util/reference/host/tensor_copy.h index a81f021127..ec62515c35 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_copy.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_copy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h index 88bbb39f45..9dd8995a61 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_fill.h b/tools/util/include/cutlass/util/reference/host/tensor_fill.h index 1a0230b55d..7904b746fd 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_fill.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_fill.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h index feb439d724..e0dc000c01 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_norm.h b/tools/util/include/cutlass/util/reference/host/tensor_norm.h index c2958e32e3..549167f8d5 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_norm.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_norm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/tensor_reduce.h b/tools/util/include/cutlass/util/reference/host/tensor_reduce.h index dd1d4fda66..2d41791576 100644 --- a/tools/util/include/cutlass/util/reference/host/tensor_reduce.h +++ b/tools/util/include/cutlass/util/reference/host/tensor_reduce.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/tensor_view_io.h b/tools/util/include/cutlass/util/tensor_view_io.h index 0043d745c2..a097e637ea 100644 --- a/tools/util/include/cutlass/util/tensor_view_io.h +++ b/tools/util/include/cutlass/util/tensor_view_io.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/type_traits.h b/tools/util/include/cutlass/util/type_traits.h index d97af0a421..e4c8951caf 100644 --- a/tools/util/include/cutlass/util/type_traits.h +++ b/tools/util/include/cutlass/util/type_traits.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: