diff --git a/changelog.md b/CHANGELOG.md similarity index 77% rename from changelog.md rename to CHANGELOG.md index d9ff1d5dd5..c0606491ea 100644 --- a/changelog.md +++ b/CHANGELOG.md @@ -1,6 +1,22 @@ # NVIDIA CUTLASS Changelog -## [1.0.1](https://github.com/NVIDIA/cutlass/releases/tag/v1.0.1) (2018-06-11) + +## 1.1.0 (2018-09-19) + * Turing Features + * WMMA GEMM targeting TensorCores - INT8, INT4, 1-bit + * Batched Strided GEMM + * Threadblock rasterization strategies + * Improved performance for adverse problem sizes and data layouts + * Extended CUTLASS Core comonents + * Tensor views support arbitrary matrix and tensor layouts + * Zip iterators for structuring multiple data streams + * Enhanced CUTLASS utilities + * Reference code for tensor operations in host and device code + * Added HostMatrix<> for simplified matrix creation + * Examples + * Basic GEMM, tensor views, CUTLASS utilities, batched GEMM, WMMA GEMM + +## 1.0.1 (2018-06-11) * Intra-threadblock reduction added for small threadblock tile sizes * sgemm_64x128x16, sgemm_128x128x16, sgemm_128x64x16, sgemm_128x32x16, sgemm_64x64x16, sgemm_64x32x16 diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a53fae555..fdd51ae88e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,11 +55,21 @@ endif() find_package(CUDA) find_package(Doxygen QUIET) +################################################################################################### +# +# Configure CMake variables +# +################################################################################################### + +find_library(CUBLAS_LIBRARY cublas HINTS + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) + # By default we want to build in Release mode to ensure that we're getting best performance if (NOT (CMAKE_BUILD_TYPE OR CONFIGURATION_TYPES)) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose build level" FORCE) # We do support Debug or Release builds - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" 
"Release") + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "RelWithDebInfo" "Release") endif() if(WIN32) @@ -68,27 +78,59 @@ if(WIN32) endif() if (WIN32) - # Enable more warnings and treat as errors - string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX") + # Enable more warnings and treat as errors + string(APPEND NVCC_FLAGS " -Xcompiler /W3 -Xcompiler /WX") - # Disable excess x86 floating point precision that can lead to results being labeled incorrectly - string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict") + # Disable warning on Unicode characters + string(APPEND NVCC_FLAGS " -Xcompiler /wd4819") - # Verbose option - if (${CUTLASS_NVCC_VERBOSE}) - string(APPEND NVCC_FLAGS " -v") - endif() + # Disable excess x86 floating point precision that can lead to results being labeled incorrectly + string(APPEND NVCC_FLAGS " -Xcompiler /fp:strict") + + # Verbose option + if (${CUTLASS_NVCC_VERBOSE}) + string(APPEND NVCC_FLAGS " -v") + endif() endif(WIN32) -# Configure CUDA options -set(CUTLASS_NVCC_ARCHS "50;60;61;70" CACHE STRING "The SM architectures to build code for.") -set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.") +set(CUTLASS_NVCC_ARCHS "50;60;61;70;75" CACHE STRING "The SM architectures to build code for.") +set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.") +set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.") +set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.") + +# +# NOTE: running with asan and CUDA requires the following environment variable: +# +# ASAN_OPTIONS=protect_shadow_gap=0:replace_intrin=0:detect_leaks=0 +# +# without the above environment setting, an error like the following may be generated: +# +# *** Error: Could not detect active GPU device ID [out of memory] +# ... +# ==9149==ERROR: LeakSanitizer: detected memory leaks +# ... 
+# +if(ENABLE_ASAN) # https://github.com/google/sanitizers/wiki/AddressSanitizer + string(APPEND NVCC_FLAGS " --compiler-options -fsanitize=address --compiler-options -fno-omit-frame-pointer") + string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=address") +endif() +################################################################################################### +# +# Configure CUDA build options +# +################################################################################################### + +# Set NVCC arguments foreach(ARCH ${CUTLASS_NVCC_ARCHS}) - string(APPEND NVCC_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") + if(CUTLASS_NVCC_EMBED_CUBIN) + string(APPEND NVCC_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") + endif() + if(CUTLASS_NVCC_EMBED_PTX) + string(APPEND NVCC_FLAGS " -gencode arch=compute_${ARCH},code=compute_${ARCH}") + endif() endforeach() - if (CUTLASS_NVCC_KEEP) string(APPEND NVCC_FLAGS " -keep") endif() @@ -99,11 +141,8 @@ else() string(APPEND NVCC_FLAGS " -lineinfo") endif() -if (UNIX) - string(APPEND NVCC_FLAGS " -Xcompiler -Wconversion") -endif() - string(APPEND NVCC_FLAGS_DEBUG " -g") +string(APPEND NVCC_FLAGS_RELWITHDEBINFO " -O3") string(APPEND NVCC_FLAGS_RELEASE " -O3") # define NDEBUG for release mode to disable assertions @@ -111,11 +150,13 @@ string(APPEND NVCC_FLAGS_RELEASE " -DNDEBUG") if (CUTLASS_NATIVE_CUDA) set(CMAKE_CUDA_FLAGS "${NVCC_FLAGS}") - set(CMAKE_CUDA_FLAGS_DEBUG "${NVCC_FLAGS_DEBUG}") set(CMAKE_CUDA_FLAGS_RELEASE "${NVCC_FLAGS_RELEASE}") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "${NVCC_FLAGS_RELWITHDEBINFO}") + set(CMAKE_CUDA_FLAGS_DEBUG "${NVCC_FLAGS_DEBUG}") else() set(CUDA_NVCC_FLAGS ${NVCC_FLAGS}) set(CUDA_NVCC_FLAGS_DEBUG ${NVCC_FLAGS_DEBUG}) + set(CUDA_NVCC_FLAGS_RELWITHDEBINFO ${NVCC_FLAGS_RELWITHDEBINFO}) set(CUDA_NVCC_FLAGS_RELEASE ${NVCC_FLAGS_RELEASE}) endif() @@ -128,6 +169,11 @@ file(GLOB CUTLASS_GEMM RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/gemm/*.h) file(GLOB CUTLASS_UTIL RELATIVE 
${CMAKE_CURRENT_SOURCE_DIR} cutlass/util/*.h) file(GLOB CUTLASS_DEVICE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/device/*.h) file(GLOB CUTLASS_CORE RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} cutlass/*.h) +################################################################################################### +# +# Define build targets +# +################################################################################################### source_group("cutlass\\gemm" FILES ${CUTLASS_GEMM}) source_group("cutlass\\util" FILES ${CUTLASS_UTIL}) @@ -156,9 +202,9 @@ add_custom_target(cutlass_ide SOURCES if (DOXYGEN_FOUND) # DOT is available. Enable graph generation in the documentation if (DOXYGEN_DOT_EXECUTABLE) - set(CUTLASS_ENABLE_DOXYGEN_DOT ON CACHE BOOL "Use dot to generate graphs in the doxygen documentation.") + set(CUTLASS_ENABLE_DOXYGEN_DOT ON CACHE BOOL "Use dot to generate graphs in the doxygen documentation.") else() - set(CUTLASS_ENABLE_DOXYGEN_DOT OFF CACHE BOOL "Use dot to generate graphs in the doxygen documentation." FORCE) + set(CUTLASS_ENABLE_DOXYGEN_DOT OFF CACHE BOOL "Use dot to generate graphs in the doxygen documentation." FORCE) endif() if (CUTLASS_ENABLE_DOXYGEN_DOT) @@ -177,6 +223,5 @@ if (DOXYGEN_FOUND) ) endif() - -#add_subdirectory(examples/gemm) add_subdirectory(tools) +add_subdirectory(examples) diff --git a/CUTLASS.md b/CUTLASS.md new file mode 100644 index 0000000000..7dea0f3729 --- /dev/null +++ b/CUTLASS.md @@ -0,0 +1,311 @@ +![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") + +# CUTLASS + +This document is intended to accompany the CUTLASS source code, to describe the interaction between +CUTLASS core components, and to identify their role in implementing GEMM computations efficiently in CUDA. + +1. [Design Patterns](#S-design-patterns) +2. [General Matrix Multiply](#S-general-matrix-multiply) +3. [Core Components](#S-core-components) +4. [Utilities](#S-utilities) + +# 1. 
Design Patterns + +CUTLASS strives to achieve the highest performance possible on NVIDIA GPUs while also offering a +flexible composition that can be easily applied to solve new problems related to Deep Learning and +linear algebra. Though we intend to make CUTLASS as simple and straightforward as possible, given +a tradeoff between simplicity and performance, CUTLASS chooses performance. Consequently, several +design patterns are necessary to yield a composable structure while also satisfying these performance +objectives. This section is intended to provide more detail. + +* [Sequencing and Nesting](#S-patterns-sequencing-nesting) +* [Tiles and Iterators](#S-patterns-tiles-iterators) +* [Host-side Params](#S-patterns-host-side-params) +* [Composable Shared Memory](#S-patterns-composable-shared-memory) + +## Sequencing and Nesting of Collective Primitives + +CUTLASS embodies a design paradigm exemplified by the [CUB library](https://nvlabs.github.io/cub/) for expressing collective operations. Objects expose an interface for a problem that is then decomposed into concurrent subtasks executed by cooperating threadblocks, warps, and threads. For example, a grid-level object may be constructed with base pointers to the start of a GEMM operation, add a threadblock-dependent offset to partition the problem, and then compute a per-threadblock GEMM. This in turn performs some operations as a collection of cooperating threads, while it may partition other parts of the task into warp-level subtasks. + +## Tiles and Iterators + +Efficient dense linear algebra computations emphasize data movement to match the execution of mathematical operators to the flow of data. Consequently, CUTLASS defines a rich set of primitives for partitioning a tile of data among participating threads, warps, and threadblocks. CUTLASS applies the familiar iterator design pattern to provide an abstraction layer to (1.) access these tile objects and (2.) 
traverse a sequence of objects embedded in a higher level data structure. These subpartitions are typically defined by compile-time constants +specifying element type, size, and data layout. CUTLASS refers to subpartitions as _tiles_. + +_Iterators_ are familiar design patterns in C++ that provide an abstraction for accessing individual +elements in memory as well as traversing over a collection. GEMM kernels in CUTLASS depend on accessing +a sequence of tiles from global memory, from shared memory, and in registers. Consequently, _tile iterators_ +are prevalent throughout the CUTLASS implementation. + +The canonical CUTLASS tile iterator template is defined in [cutlass/tile_iterator.h](cutlass/tile_iterator.h). + +## Host-side Params structure + +Several CUTLASS template classes exhibit a pattern in which problem-specific internal state is known at kernel launch time and remains invariant throughout the execution of a kernel. For example, tile iterators compute several offsets based on the strides of the input tensor that is added to an internal pointer when loading the elements of a tile. These are computed from the tensor stride and never updated; the per-thread internal state consists only of the internal global memory pointer. + +CUTLASS can take advantage of this CUDA grid-invariant property by constructing the object in host code and passing a composed parameters structure to the kernel. This confers two benefits: (1.) invariant state is held in constant memory, and (2.) there is no overhead to compute the initial state by each thread. + +The design pattern in CUTLASS is for classes with nontrivial constructors to define `struct Params` as an inner class which contains grid-invariant state. These should define a constructor and an `initialize()` method. The `Params` structure should also include a data member corresponding to each data member in the parent class, so these too can be properly constructed in host code. 
The parent class should define a constructor which accepts `Params const &` as its first argument. + +For example, `cutlass::gemm::Gemm<>` should define `struct cutlass::gemm::Gemm::Params`. The latter should define data members for each data member in `cutlass::gemm::Gemm<>`. + + +## Composable shared memory allocation + +Shared memory requires explicit effort by the programmer to allocate and de-allocate. CUTLASS follows the paradigm introduced by [CUB](https://nvlabs.github.io/cub/) to define composed structures for storing data intended to be held in shared memory. Any object requiring shared memory storage for itself or its data members should define a child structure called SharedStorage. This holds data needed by the class and also instantiates SharedStorage objects for each data member. + +To be consistent, this pattern defines a convention in which classes define internal shared memory storage requirements. Classes should consider all SharedStorage structures to be opaque other than their own child class. When the lifetimes of child objects are known to be non-overlapping, unions may be used to alias multiple SharedStorage objects to the same shared memory region and reduce overall SMEM capacity. + +## Loop Unrolling + +CUTLASS requires tiles of data to be stored in registers for high-bandwidth access. Simultaneously, high-throughput math instructions +must be issued concurrently with memory instructions to hide latency with relatively few concurrent threads. These objectives are +achieved by unrolling loops whose iteration counts are known at compile time. + +Consequently, most loops within the CUTLASS GEMM implementation are specified by constant values and template arguments. The CUDA compiler +is able to unroll the loop bodies, map array elements to registers, and construct an efficient instruction schedule. + +## Templates + +CUDA C++ templates and modern generic programming techniques enable CUTLASS device code to span a large design space. 
+ +This design space includes: +* Mixed precision arithmetic and data storage +* Kernels specialized for layout and problem size +* Support for kernel fusion + +Moreover, templates provided a structured approach to collecting compile-time constants such as tile dimensions. These +must be template arguments to target static array allocation and take advantage of loop unrolling, constant folding, +and function inlining. + +# 2. General Matrix Multiply + +The following figure illustrates the hierarchical GEMM computation embodied by CUTLASS. Each stage depicts a nested level of tiling which corresponds to a layer of concurrency within the CUDA execution model and to a level within the memory hierarchy, becoming increasingly finer moving left to right. + +![ALT](/media/images/gemm-structural-components.png "CUTLASS GEMM Structural Components") + +## Threadblock-level GEMM + +The CUTLASS GEMM kernel partitions the _C_ matrix into a 2D tiling of threadblocks. +Each threadblock computes a matrix product whose outer dimensions _M_ and _N_ are compile-time constants. The +GEMM's _K_ dimension is partitioned into tiles and iterated over by the GEMM _mainloop_. The shape of the matrix +multiply operation performed by each iteration of the mainloop is referred to as _OutputTile_. + +The threadblock loads a sequence of tiles from global memory and stores this data to shared memory. The iterative +access and traversal of tiles in global memory are performed by a _TileLoadIterator_, and storing to a circular +buffer in shared memory is performed by a _GlobalLoadIterator_. + +**[Global Load Stream](cutlass/gemm/gemm_global_stream.h)** manages loading of the threadblock-scope multiplicands to the GEMM kernel. It owns an iterator into global memory for loading tiles of data, a TensorAllocation in shared memory to hold the resulting tile, and an iterator for writing the tile into this allocation. 
A transformer exists to optionally transform the data as it is loaded which may be of use to perform type conversion or, in the case of int8 GEMM, transpose 4x4 tiles held in registers. + +The Global Load Stream template contains members defined by the following templates: + +* [GemmGlobalIteratorAb](cutlass/gemm/gemm_global_tile.h) +* [Transformer](cutlass/convert.h) +* [GemmSharedStoreTileAb](cutlass/gemm/gemm_shared_tile.h) + +## Warp-level GEMM + +The threadblock's _OutputTile_ is partitioned among the warps, and each computes a warp-level matrix product. +Data is loaded from shared memory into registers, and math instructions are dispatched to CUDA Cores or Tensor Cores. + +[**Shared Load Stream**](cutlass/gemm/gemm_shared_stream.h) manages loading of warp-level multiplicands from shared memory into registers. This owns an iterator for fetching data and the destination fragments for holding the results. + +* [GemmSharedLoadTile{A,B}](cutlass/gemm/gemm_shared_tile.h) + +**Matrix Multiply** computes a matrix product operation on data held in registers. Specializations exist for thread-level instructions such as single-precision fused multiply-add as well as warp-level matrix operations targeting TensorCores. + +* [WMMA Multiply Add](cutlass/gemm/wmma_gemm_multiply_add.h) + +## Thread-level GEMM + +SGEMM, IGEMM, HGEMM, and DGEMM are computed by SIMT math instructions issued by thread-level matrix multiply +procedures. + +* [ThreadMultiplyAdd](cutlass/gemm/thread_multiply_add.h) +* [IGEMM specialization](cutlass/gemm/igemm_multiply_add.h) +* [HGEMM specialization](cutlass/gemm/hgemm_multiply_add.h) + +## Epilogue + +The [**epilogue**](cutlass/gemm/gemm_epilogue.h) iteratively selects a subset of accumulator elements held by a warp, writes them to shared memory, and loads them by different threads such that a threadblock-scoped tile store operation will make contiguous, striped accesses to global memory. Thus, the flow of data utilizes the following components: + +1. 
[Transformer](cutlass/convert.h) for converting the data types of accumulator elements +2. [GemmSharedStoreTileD](cutlass/gemm/gemm_shared_tile.h) to store to shared memory specialized to the accumulator layout. +3. [GemmSharedLoadTileD](cutlass/gemm/gemm_shared_tile.h) to load the data from shared memory. +4. [GemmGlobalIteratorC](cutlass/gemm/gemm_global_tile.h) to load a tile from global memory. +5. A [functor](cutlass/gemm/linear_scaling.h) to compute an element-wise operation on the matrix product and source data (such as alpha*AB+beta*C). +6. [GemmGlobalIteratorD](cutlass/gemm/gemm_global_tile.h) to write the output to global memory. + +## GEMM Traits + +[**cutlass::gemm::GemmTraits**](cutlass/gemm/gemm_traits.h) collects the structural properties of a complete GEMM computation into a single template class. As a result, the Traits classes encapsulate the iterators and transformers for all supported GEMM operands and layouts. Low-level details needed by Traits (such as scalar types for operands, thread-block tile size, number of scalar elements per memory access within each phase, number of stages in shared memory, as well as other implementation-specific properties of the GEMM computation) are specified in class [**cutlass::gemm::GemmConfig**](cutlass/gemm/gemm_config.h). + + +# 3. Core Components + +CUTLASS GEMM kernels are implemented by a set of Core components for interacting with mathematical tensor and matrix +objects as well as constructing efficient CUDA kernels. + +* [Tensor views](#S-core-tensor-views) +* [Shape](#S-core-shape) +* [Tile structure](#S-core-tile-structure) +* [Fragment](#S-core-fragment) +* [Predicate vector](#S-core-predicate-vector) + +## Tensor View + +Matrices and tensors are typically represented as n-D arrays held in linear memory with a single base pointer and a stride vector. Element _i_ of the stride vector indicates the offset in linear memory between consecutive elements in dimension i. 
Consequently, the linear offset for an arbitrary element specified as an n-tuple may be computed as the dot product of the coordinate and the stride vector. + +CUTLASS provides abstractions for interacting with multidimensional tensors in device memory. +Consequently, we define a hierarchy of pointer-like types for referencing tensors. + +`T *` - raw pointer to elements of type T + +`cutlass::TensorRef` - reference to a tensor of elements of type T and given rank. Includes a mapping function and associated stride vector for accessing elements in linear memory. + +`cutlass::TensorView` - extends `TensorRef<>` by adding bounds information. This is a complete mathematical object which may be used as the argument to CUTLASS functions. + +The above provide an identity mapping of a logical index space to linear memory. An element +at logical coordinate X has an offset computed as follows: +``` +offset = dot(X, stride) +``` +where `dot()` computes the inner product of X and a vector of "strides." + +CUTLASS 1.1 introduces a mapping function and an additional "storage rank" to offer a flexible way to +map the logical index space of the tensor to memory. The mapping function maps a coordinate +of rank _R_ to an index space of rank _S_. The linear offset is computed as: +``` +offset = dot( MapFunc(X), stride ) +``` +where stride is a vector of rank _S_. + +CUTLASS kernels make extensive use of vectorization of memory accesses for efficiency and +correctness. Consequently, we enforce a constraint on the strides used by mapping functions +such that: + +1. The "fastest-changing" stride is always 1 thereby mandating that consecutive elements in + that rank are consecutive in linear memory. + +2. The fastest changing rank is always last in the stride vector and not explicitly stored. + +Thus, the stride vector used by mapping functions has length of one fewer than the rank of the +storage tensor. 
These constraints are consistent with the BLAS interface of passing matrices as +a tuple consisting of a pointer and a "leading dimension." In fact, these are rank=2 tensors +whose fastest changing dimension is 1, and only the strided dimension is explicitly represented. + +A typical mapping function might simply map the rows and columns of a matrix, a rank=2 tensor, +to linear memory such that (1.) elements in the same column are consecutive in memory +(column-major), or (2.) elements in the same row are consecutive (row-major). These can be +accomplished by two different mapping functions whose stride vector is length=2. The first +element is the "leading dimension." + +The requirement that the fastest-changing stride always be of unit size need not be a limitation. +To implement "sparse" computations or matrix operations in which matrix elements have arbitrary +stride along both row and column, define a mapping function whose storage rank is 3. This permits +two elements of the stride vector to have a non-unit value. + +`cutlass::TensorView<>` extends this concept by including a size vector to specify the bounds of +the index space. The value of each coordinate in the size vector defines the half-open range of +indices whose smallest value is zero. + +## Shape + +To avoid complicated template metaprogramming, CUTLASS targets fixed compile-time tile sizes specified +by a four-dimensional template `cutlass::Shape<>`. This defines the following dimensions, mirroring +the NHWC tensor format used for convolution in Deep Learning frameworks. + +- `D`: depth of tensor +- `H`: first strided dimension +- `W`: contiguous sequence of tensor elements +- `C`: number of channels, usually used for vectorized access + +Template specializations of `Shape` appear as arguments to numerous dependent template classes which +must specify compile-time constant tile sizes. 
+ +## Tile Structure + +Tiled structures express an arrangement of data in memory as well as a logical mapping of concurrent CUDA +threads to the problem space. For example, the CUTLASS GEMM + +Tiled structures can be defined using the `cutlass::TileTraits<>` concept which defines the following +members. Collectively, these members offer a flexible way to define a 4-D subpartition of an integer +lattice, partition its elements among a collection of threads, and map each unique thread ID to a unique +offset. + +- _Tile_ (concept `Shape<>`) - describes the dimensions of the tile in terms of scalar elements +- _Delta_ (concept `Shape<>`) - describes the distance along each logical dimension between items +- _Iterations_ (concept `Shape<>`) - describes the number of items along each logical dimension +- _ThreadOffset_ (concept _functor_) - implements `Coord<4> operator()() const` to determine a thread's + initial offset in the logical 4-D coordinate space + +The following figure illustrates the CUTLASS tile structure. The overall shape, 16-by-16, is partitioned into +vectors of length two among 32 threads. The elements stored by thread 9 are highlighted. 
+ +CUTLASS tile structure + +The `cutlass::TileTraits<>` definition that describes this arrangement may be defined as follows: + +``` +struct ExampleTileTraits { + + /// Overall shape of tile + typedef Shape<1, 16, 16, 1> Tile; + + /// Distance along each dimension of accesses + typedef Shape<1, 4, 1, 1> Delta; + + /// Number of memory accesses performed by each thread + typedef Shape<1, 4, 1, 1> Iterations; + + /// Offset function - maps each thread to a unique starting offset within the 4D tile + struct ThreadOffset { + + CUTLASS_DEVICE Coord<4> operator()() const { + + typedef Shape<1, 16, 8, 2> Vectorized; + + return make_Coord( + 0, // depth "D" dimension + threadIdx.x / Vectorized::kW, // horizontal "H" dimension - first strided dimension + threadIdx.x % Vectorized::kW, // vertical "W" dimension - contiguous dimension + 0 + ); + } + }; +}; +``` + +## Tile Iterator + +The iterator design pattern provides an abstraction for accessing the items in a collection in sequence. Basic +operators defined by iterators consist of accessing an item - either a load or store - followed by traversal to +the next item in sequence. + +CUTLASS tile access and traversal + +To offer a generic solution that spans numerous data types and layouts, CUTLASS defines the _TileIterator_ concept. +This concept provides access to a sequence of _tiles_ embedded in a tensor in addressable memory. + +The canonical CUTLASS tile iterator template is defined in [cutlass/tile_iterator.h](cutlass/tile_iterator.h). + +## Fragment + +A fragment is analogous to `std::array<>` in that it is a constant-sized array of elements. Typically backed by storage in the SM's register file, CUTLASS `Fragment<>` objects are used to store tiles. For threadblock- and warp-scope operations, the contents of these tiles are distributed across the participating threads. In such cases, a thread's `Fragment<>` contains the part of the tile held by that thread. 
+ +## Predicate Vector + +SIMT architectures utilize predicated execution in place of control flow when conditional code sequences are fairly short, on the order of a few machine instructions. While CUDA C++ does not include constructs at the language level for predication, PTX makes this explicit, and compilation to SASS is assumed to aggressively utilize predication. Typical applications are to initialize a sequence of bits used to mask memory operations and use these bits as predicates guarding memory load and store instructions. + +CUTLASS provides `PredicateVector` defined in [cutlass/predicate_vector.h](cutlass/predicate_vector.h) to manage a statically-sized bit vector, store them into general purpose registers, and efficiently access them in sequence. By storing four predicates per byte in hardware registers, the CUDA compiler is able to issue specialized instructions to achieve very efficient unpacking. + + +# 4. Utilities + +CUTLASS implements efficient matrix multiply computations on GPUs. It is accompanied by an extensive utility +framework offering features such as: + +* [cutlass::half_t](tools/util/half.h) - a host-side half-precision type +* Components for allocating and initializing [host-side and device-side tensors](tools/util/host_tensor.h) usable by CUTLASS +* Reference implementations of [GEMM](tools/util/reference/host/gemm.h) and [element-wise operations](tools/util/reference/host/tensor_elementwise.h) diff --git a/Doxyfile b/Doxyfile index 51cec529b3..1d96f37708 100644 --- a/Doxyfile +++ b/Doxyfile @@ -58,7 +58,7 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. 
-OUTPUT_DIRECTORY = docs +OUTPUT_DIRECTORY = doxygen # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and diff --git a/README.md b/README.md index 56473a2861..c53a42f4bc 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 1.0 +# CUTLASS 1.1 -_CUTLASS 1.0.1 - June 2018_ +_CUTLASS 1.1.0 - September 2018_ -CUTLASS 1.0 is a collection of CUDA C++ template abstractions for implementing +CUTLASS 1.1 is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. It incorporates strategies for hierarchical decomposition and data movement similar to those used to implement cuBLAS. CUTLASS decomposes these "moving parts" into @@ -22,14 +22,27 @@ point (FP64) types. Furthermore, CUTLASS demonstrates CUDA's WMMA API for targe the programmable, high-throughput _Tensor Cores_ provided by NVIDIA's Volta architecture and beyond. -CUTLASS 1.0 has changed substantially from our preview release described in -the [CUTLASS Parallel For All](https://devblogs.nvidia.com/parallelforall/cutlass-linear-algebra-cuda) -post. We have decomposed the structure of the GEMM computation into deeper, structured -primitives for loading data, computing predicate masks, streaming data at each level of -the GEMM hierarchy, and updating the output matrix. - -CUTLASS 1.0 is described in the [Doxygen documentation](https://nvidia.github.io/cutlass) -and our talk at the [GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf). +CUTLASS 1.1 is described in the [CUTLASS Documentation](CUTLASS.md) and the accompanying +[Doxygen documentation](https://nvidia.github.io/cutlass). 
+We describe the structure of an efficient GEMM in our talk at the +[GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf). + +# What's New in CUTLASS 1.1 + +* [CUTLASS Documentation](CUTLASS.md) +* [Examples](examples/) + * Basic GEMM, tensor views, CUTLASS utilities, batched GEMM, WMMA GEMM +* Turing Features + * [WMMA GEMM targeting TensorCores](tools/test/unit/gemm/wmma_integer_gemm.cu) - INT8, INT4, 1-bit +* [Batched Strided GEMM](tools/test/unit/gemm/batched_strided_sgemm_128x128x8.cu) +* [Threadblock rasterization strategies](tools/test/unit/gemm/sgemm_threadblock_swizzle_nt.cu) + * Improved performance for adverse problem sizes and data layouts +* Extended CUTLASS Core components + * Tensor views support arbitrary matrix and tensor layouts + * Zip iterators for structuring multiple data streams +* Enhanced CUTLASS utilities + * [Reference implementations](tools/util/reference) for tensor operations in [host](tools/util/reference/host) and [device](tools/util/reference/device) code + * Added `HostMatrix<>` for simplified matrix creation # Performance @@ -39,11 +52,11 @@ CUTLASS primitives are very efficient. When used to construct device-wide GEMM they exhibit performance comparable to cuBLAS for scalar GEMM computations. The above figure shows CUTLASS performance relative to cuBLAS for large matrix dimensions (M=10240, N=K=4096) running on an NVIDIA Titan V GPU -when compiled with CUDA 9.2. +when compiled with CUDA 10.0. # Compatibility -CUTLASS requires CUDA 9 and performs best with [CUDA 9.2 Toolkit](ttps://developer.nvidia.com/cuda-toolkit) or later. +CUTLASS requires CUDA 9 but performs best with [CUDA 10.0 Toolkit](https://developer.nvidia.com/cuda-toolkit) or later. |**Operating System** | **Compiler** | |-----------------|----------| @@ -63,7 +76,7 @@ any Maxwell-, Pascal-, or Volta-architecture NVIDIA GPU. 
|NVIDIA Tesla P100| |NVIDIA Tesla V100| |NVIDIA TitanV| - +|NVIDIA GeForce RTX 2080 TI, 2080, 2070| # Building CUTLASS @@ -79,7 +92,7 @@ $ git submodule update --init --recursive ``` CUTLASS can be build with CMake starting version 3.10. By default CUTLASS will build kernels -for CUDA architecture versions 5.0, 6.0, 6.1 and 7.0. To reduce compile time you can specify +for CUDA architecture versions 5.0, 6.0, 6.1, 7.0 and 7.5. To reduce compile time you can specify the architectures to build CUTLASS for by changing the CMake configuration setting `CUTLASS_NVCC_ARCHS`. @@ -107,13 +120,12 @@ $ ./tools/test/unit/cutlass_unit_test ... ... [----------] Global test environment tear-down -[==========] 481 tests from 24 test cases ran. (5954 ms total) -[ PASSED ] 481 tests. +[==========] 946 tests from 57 test cases ran. (10812 ms total) +[ PASSED ] 946 tests. ``` All tests should pass, though the exact number of tests may vary over time. - # Project Structure CUTLASS is arranged as a header-only library with several example test programs @@ -128,28 +140,41 @@ templates in the cutlass/gemm directory. ``` cutlass/ - gemm/ - util/ - + gemm/ + util/ + ``` Several tools and test programs are also distributed with the CUTLASS library. They are contained in the following directories. ``` +examples/ + 00_basic_gemm/ + 01_tensor_view/ + 02_cutlass_utilities/ + 03_batched_gemm/ + 04_tile_iterator/ + 05_wmma_gemm/ tools/ - test/ - unit/ - core/ - gemm/ - perf/ - util/ - + test/ + unit/ + core/ + gemm/ + perf/ + util/ + reference/ + device/ + host/ + ``` The `test/unit/` directory consist of unit tests implemented with Google Test that demonstrate basic usage of Core API components and complete tests of the CUTLASS GEMM computations. +The `tools/util` directory contains CUTLASS utilities including reference implementations of GEMM and +several element-wise tensor operations. 
+ # Performance Profiling The `test/perf/` directory contains a command-line utility for launching each of the GEMM kernels. diff --git a/clang-format.sh b/clang-format.sh deleted file mode 100755 index b2570d9147..0000000000 --- a/clang-format.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -set -e - -function formatFiles { - for f in `find "$1" -type f -name "*.$2"` ; do - COMMAND="clang-format -i $f" - echo $COMMAND - $COMMAND - done -} - -formatFiles "cutlass" "h" -formatFiles "tools/test" "h" -formatFiles "tools/test" "cpp" -formatFiles "tools/util" "h" - diff --git a/cutlass/convert.h b/cutlass/convert.h index 933d68a82a..b4d0f8eddb 100644 --- a/cutlass/convert.h +++ b/cutlass/convert.h @@ -28,7 +28,7 @@ */ #pragma once -#include +#include "cutlass/fragment.h" namespace cutlass { diff --git a/cutlass/coord.h b/cutlass/coord.h index 431c9bf1a0..625a22723d 100644 --- a/cutlass/coord.h +++ b/cutlass/coord.h @@ -28,7 +28,8 @@ #pragma once -#include +#include "cutlass/cutlass.h" +#include "cutlass/util/platform.h" namespace cutlass { @@ -44,20 +45,27 @@ struct Identity { //////////////////////////////////////////////////////////////////////////////////////////////////// /// Statically-sized array specifying Coords within a tensor -template +template struct Coord { // // Type and constant definitions // - static int const N = N_; + /// Number of elements in Coord + static int const kRank = Rank_; + + /// Number of elements in Coord, aliased for compatibility + static int const N = Rank_; + + /// Index type used to store elements + typedef Index_ Index; // // Data members // /// Indices - int idx[N]; + Index idx[kRank]; // // Methods @@ -65,25 +73,72 @@ struct Coord { /// Default ctor initializes uniformly CUTLASS_HOST_DEVICE - Coord(int value = 0) { - for (int i = 0; i < N; ++i) { + Coord(Index value = 0) { + for (int i = 0; i < kRank; ++i) { idx[i] = value; } } /// Constructs from an array of integers CUTLASS_HOST_DEVICE - Coord(int _idx[]) { - for (int i = 0; i 
< N; ++i) { + Coord(Index _idx[]) { + for (int i = 0; i < kRank; ++i) { idx[i] = _idx[i]; } } + /// Copy constructor: constructs from another Coord + CUTLASS_HOST_DEVICE + Coord(Coord const &coord) { + for (int i = 0; i < kRank; ++i) { + idx[i] = coord[i]; + } + } + + /// Returns a slice of the Coord which may be larger or smaller in rank + /// than this. + template + CUTLASS_HOST_DEVICE + Coord slice(int start = 0, Index identity = 0) const { + Coord result; + for (int i = 0; i < Slice; ++i) { + if (i + start < kRank) { + slice[i] = idx[i + start]; + } + else { + slice[i] = identity; + } + } + return result; + } + + /// Returns true if Coord is non-zero. + CUTLASS_HOST_DEVICE + operator bool() const { + for (int i = 0; i < kRank; ++i) { + if (idx[i]) { + return true; + } + } + return false; + } + + /// Returns true if Coord is uniformly zero. + CUTLASS_HOST_DEVICE + bool operator!() const { + for (int i = 0; i < kRank; ++i) { + if (idx[i]) { + return false; + } + } + return true; + } + /// Element-wise addition CUTLASS_HOST_DEVICE Coord operator+(Coord const& b) const { Coord c; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < kRank; ++i) { c.idx[i] = idx[i] + b.idx[i]; } return c; @@ -93,7 +148,7 @@ struct Coord { CUTLASS_HOST_DEVICE Coord operator-(Coord const& b) const { Coord c; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < kRank; ++i) { c.idx[i] = idx[i] - b.idx[i]; } return c; @@ -103,7 +158,7 @@ struct Coord { CUTLASS_HOST_DEVICE Coord operator*(Coord const& b) const { Coord c; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < kRank; ++i) { c.idx[i] = idx[i] * b.idx[i]; } return c; @@ -113,7 +168,7 @@ struct Coord { CUTLASS_HOST_DEVICE Coord operator/(Coord const& b) const { Coord c; - for (int i = 0; i < N; ++i) { + for (int i = 0; i < kRank; ++i) { c.idx[i] = idx[i] / b.idx[i]; } return c; @@ -122,7 +177,7 @@ struct Coord { /// In-place addition CUTLASS_HOST_DEVICE Coord& operator+=(Coord const& b) { - for (int i = 0; i < N; ++i) { + for (int i = 0; 
i < kRank; ++i) { idx[i] += b.idx[i]; } return *this; @@ -131,7 +186,7 @@ struct Coord { /// In-place subtraction CUTLASS_HOST_DEVICE Coord& operator-=(Coord const& b) { - for (int i = 0; i < N; ++i) { + for (int i = 0; i < kRank; ++i) { idx[i] -= b.idx[i]; } return *this; @@ -140,7 +195,7 @@ struct Coord { /// In-place multiplication CUTLASS_HOST_DEVICE Coord& operator*=(Coord const& b) { - for (int i = 0; i < N; ++i) { + for (int i = 0; i < kRank; ++i) { idx[i] *= b.idx[i]; } return *this; @@ -149,22 +204,22 @@ struct Coord { /// In-place division CUTLASS_HOST_DEVICE Coord& operator/=(Coord const& b) { - for (int i = 0; i < N; ++i) { + for (int i = 0; i < kRank; ++i) { idx[i] /= b.idx[i]; } return *this; } /// Member access operator - CUTLASS_HOST_DEVICE int& operator[](int dim) { return idx[dim]; } + CUTLASS_HOST_DEVICE Index& operator[](int dim) { return idx[dim]; } /// Member access operator - CUTLASS_HOST_DEVICE int const& operator[](int dim) const { return idx[dim]; } + CUTLASS_HOST_DEVICE Index const& operator[](int dim) const { return idx[dim]; } /// Computes the dot product of two Coord instances template CUTLASS_HOST_DEVICE T dot(Coord const& b, T sum) const { - for (int i = 0; i < N; ++i) { + for (int i = 0; i < kRank; ++i) { sum += idx[i] * b.idx[i]; } return sum; @@ -174,7 +229,7 @@ struct Coord { template CUTLASS_HOST_DEVICE T dot(Coord const& b) const { T sum = T(0); - for (int i = 0; i < N; ++i) { + for (int i = 0; i < kRank; ++i) { sum += idx[i] * b.idx[i]; } return sum; @@ -182,29 +237,29 @@ struct Coord { /// Gets the index of a given Coord element template - CUTLASS_HOST_DEVICE int& at() { + CUTLASS_HOST_DEVICE Index& at() { return idx[Dim]; } /// Access via index; may limit unrolling potential CUTLASS_HOST_DEVICE - int& at(int dim) { return idx[dim]; } + Index& at(int dim) { return idx[dim]; } /// Gets the index of a given Coord element template - CUTLASS_HOST_DEVICE int const& at() const { + CUTLASS_HOST_DEVICE Index const& at() const { 
return idx[Dim]; } /// Access via index; may limit unrolling potential CUTLASS_HOST_DEVICE - int const& at(int dim) const { return idx[dim]; } + Index const& at(int dim) const { return idx[dim]; } /// Determines if two Coord<> objects are equal CUTLASS_HOST_DEVICE - bool operator==(Coord const& b) const { + bool operator==(Coord const& b) const { bool equal = true; - for (int i = 0; equal && i < N; ++i) { + for (int i = 0; equal && i < kRank; ++i) { equal = (idx[i] == b.idx[i]); } return equal; @@ -212,12 +267,12 @@ struct Coord { /// Not equal CUTLASS_HOST_DEVICE - bool operator!=(Coord const& b) const { return !(*this == b); } + bool operator!=(Coord const& b) const { return !(*this == b); } /// Clamps a coordinate to a range specified by maximum and minimum values CUTLASS_HOST_DEVICE - Coord& clamp(Coord const& max, Coord const& min = Coord()) { - for (int i = 0; i < N; ++i) { + Coord& clamp(Coord const& max, Coord const& min = Coord()) { + for (int i = 0; i < kRank; ++i) { idx[i] = __NV_STD_MAX(__NV_STD_MIN(idx[i], max.idx[i]), min.idx[i]); } return *this; @@ -225,13 +280,35 @@ struct Coord { /// Returns the product of all elements CUTLASS_HOST_DEVICE - int count() const { - int product = idx[0]; - for (int i = 1; i < N; ++i) { + Index count() const { + Index product = idx[0]; + for (int i = 1; i < kRank; ++i) { product *= idx[i]; } return product; } + + /// Less than operator + CUTLASS_HOST_DEVICE + bool operator<(Coord const &b) const { + for (int i = 0; i < kRank; ++i) { + if (!(idx[i] < b[i])) { + return false; + } + } + return true; + } + + /// Less than or equals operator + CUTLASS_HOST_DEVICE + bool operator<=(Coord const &b) const { + for (int i = 0; i < kRank; ++i) { + if (!(idx[i] <= b[i])) { + return false; + } + } + return true; + } }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -266,21 +343,10 @@ Coord<4> make_Coord(int _0, int _1, int _2, int _3) { 
//////////////////////////////////////////////////////////////////////////////////////////////////// -/// Getter -CUTLASS_HOST_DEVICE -Coord<2> get_Coord_hw(Coord<3> const& coord) { return make_Coord(coord[1], coord[2]); } - -/// Getter -CUTLASS_HOST_DEVICE -Coord<2> get_Coord_hw(Coord<4> const& coord) { return make_Coord(coord[1], coord[2]); } - -/// Getter -CUTLASS_HOST_DEVICE -Coord<3> get_Coord_hwc(Coord<4> const& coord) { return make_Coord(coord[1], coord[2], coord[3]); } - -/// Getter -CUTLASS_HOST_DEVICE -Coord<3> get_Coord_dhw(Coord<4> const& coord) { return make_Coord(coord[0], coord[1], coord[2]); } +template +CUTLASS_HOST_DEVICE Coord<3> make_Coord_from_shape() { + return make_Coord(Shape_::kD, Shape_::kH, Shape_::kW); +} //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cutlass/core_io.h b/cutlass/core_io.h index cceea4c06d..849a7613f4 100644 --- a/cutlass/core_io.h +++ b/cutlass/core_io.h @@ -22,8 +22,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ -#pragma once - /*! \file \brief Helpers for printing cutlass/core objects */ @@ -33,12 +31,96 @@ #include #include -#include +#include "cutlass/coord.h" +#include "cutlass/vector.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// template -std::ostream& operator<<(std::ostream& out, cutlass::Coord const& coord) { +std::ostream& operator<<(std::ostream& out, Coord const& coord) { for (int i = 0; i < Rank; ++i) { out << (i ? 
", " : "") << coord.idx[i]; } return out; } + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to enable formatted printing of CUTLASS scalar types to an ostream +template +struct ScalarIO { + + /// Value to print + T value; + + /// Default ctor + ScalarIO() { } + + /// Constructs from a value + ScalarIO(T value): value(value) {} +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Default printing to ostream +template +inline std::ostream &operator<<(std::ostream &out, ScalarIO const &scalar) { + return out << scalar.value; +} + +/// Printing to ostream of int8_t as integer rather than character +template <> +inline std::ostream &operator<<(std::ostream &out, ScalarIO const &scalar) { + return out << int(scalar.value); +} + +/// Printing to ostream of uint8_t as integer rather than character +template <> +inline std::ostream &operator<<(std::ostream &out, ScalarIO const &scalar) { + return out << unsigned(scalar.value); +} + +/// Printing to ostream of vector of 1b elements +template <> +inline std::ostream &operator<<( + std::ostream &out, + ScalarIO > const &scalar) { + + for (int i = 0; i < 32; i++) { + out << int(scalar.value[i]); + out << ((i != 31) ? ", " : ""); + } + return out; +} + +/// Printing to ostream of vector of 4b signed integer elements +template <> +inline std::ostream &operator<<( + std::ostream &out, + ScalarIO > const &scalar) { + + for (int i = 0; i < 8; i++) { + out << int(scalar.value[i]); + out << ((i != 7) ? ", " : ""); + } + return out; +} + +/// Printing to ostream of vector of 4b unsigned integer elements +template <> +inline std::ostream &operator<<( + std::ostream &out, + ScalarIO > const &scalar) { + + for (int i = 0; i < 8; i++) { + out << unsigned(scalar.value[i]); + out << ((i != 7) ? 
", " : ""); + } + return out; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/cutlass/cutlass.h b/cutlass/cutlass.h index 19600ec8f7..15ea83c014 100644 --- a/cutlass/cutlass.h +++ b/cutlass/cutlass.h @@ -32,8 +32,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// #define CUTLASS_MAJOR 1 -#define CUTLASS_MINOR 0 -#define CUTLASS_PATCH 1 +#define CUTLASS_MINOR 1 +#define CUTLASS_PATCH 0 #define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH) #ifdef __NVCC__ @@ -47,7 +47,9 @@ // CUTLASS_DEVICE is an error if not compiling device code #endif -// CUTLASS_PRAGMA_UNROLL inserts a CUTLASS_PRAGMA_UNROLL if supported by the compiler +#define CUTLASS_ASSERT(x) assert(x) + +// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. #if defined(__CUDA_ARCH__) #if defined(_MSC_VER) #define CUTLASS_PRAGMA_UNROLL __pragma("unroll") @@ -61,7 +63,22 @@ #define CUTLASS_PRAGMA_NO_UNROLL #endif -#define CUTLASS_ASSERT(x) assert(x) +#define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL + +// A small helper class to dump a type at compile time +// Usage:: DumpType::Class +template +struct DebugType {}; + +template +void DebugTypeFunc(T const& t) { + T::t; +} + +// A small helper class to dump a compile time constant at compile time +// Usage: DumpValue::kConstant +template +struct DebugValue {}; namespace cutlass { diff --git a/cutlass/fragment.h b/cutlass/fragment.h index 886b11405c..6a93d779c4 100644 --- a/cutlass/fragment.h +++ b/cutlass/fragment.h @@ -29,9 +29,9 @@ #pragma once #include -#include -#include -#include +#include "cutlass/shape.h" +#include "cutlass/util/cutlass_math.h" +#include "cutlass/vector.h" namespace cutlass { @@ -72,7 +72,7 @@ provides access to element at (d, h, w, c) 
//////////////////////////////////////////////////////////////////////////////////////////////////// -template +template struct StorageType { typedef uint64_t Type; }; @@ -108,9 +108,11 @@ struct Fragment : public AlignedStruct { typedef Element_ Element; /// The number of elements. static int const kElements = kElements_; + /// Alignment + static int const kAlignment = kAlignment_; /// Clear a fragment. - CUTLASS_DEVICE void clear() { + CUTLASS_HOST_DEVICE void clear() { // Avoid element-wise access for sub 32b element type if (kAlignment_ >= 8 && (kElements * sizeof(Element)) % 8 == 0) { uint64_t* ptr = reinterpret_cast(storage); @@ -135,14 +137,10 @@ struct Fragment : public AlignedStruct { } /// The accessor. - CUTLASS_DEVICE Element& operator[](int i) { - assert(i < kElements_); - return reinterpret_cast(storage)[i]; - } + CUTLASS_HOST_DEVICE Element& operator[](int i) { return reinterpret_cast(storage)[i]; } /// The accessor. - CUTLASS_DEVICE Element const& operator[](int i) const { - assert(i < kElements_); + CUTLASS_HOST_DEVICE Element const& operator[](int i) const { return reinterpret_cast(storage)[i]; } @@ -188,35 +186,35 @@ struct FragmentIterator { /// Ctor. template - CUTLASS_DEVICE FragmentIterator(OtherFragment_& fragment, int offset = 0) + CUTLASS_HOST_DEVICE FragmentIterator(OtherFragment_& fragment, int offset = 0) : pointer(reinterpret_cast(&fragment[offset])) { static_assert(OtherFragment_::kElements >= Fragment::kElements, ""); } /// The accessor. - CUTLASS_DEVICE AccessType const& at(int d, int h, int w, int c = 0) const { + CUTLASS_HOST_DEVICE AccessType const& at(int d, int h, int w, int c = 0) const { int const imm = ComputeOffsetFromStrides::get(d, h, w, c); return reinterpret_cast(pointer[imm]); } /// The accessor. 
- CUTLASS_DEVICE AccessType& at(int d, int h, int w, int c = 0) { + CUTLASS_HOST_DEVICE AccessType& at(int d, int h, int w, int c = 0) { int const imm = ComputeOffsetFromStrides::get(d, h, w, c); return reinterpret_cast(pointer[imm]); } /// The accessor. - CUTLASS_DEVICE AccessType const& operator[](int i) const { + CUTLASS_HOST_DEVICE AccessType const& operator[](int i) const { return reinterpret_cast(pointer[i * kElementsPerAccess]); } /// The accessor. - CUTLASS_DEVICE AccessType& operator[](int i) { + CUTLASS_HOST_DEVICE AccessType& operator[](int i) { return reinterpret_cast(pointer[i * kElementsPerAccess]); } /// Is the iterator valid? - CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const { return true; } + CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const { return true; } /// The pointer. Element* pointer; @@ -246,28 +244,28 @@ struct FragmentConstIterator { /// Ctor. template - CUTLASS_DEVICE FragmentConstIterator(OtherFragment_& fragment, int offset = 0) + CUTLASS_HOST_DEVICE FragmentConstIterator(OtherFragment_& fragment, int offset = 0) : pointer(reinterpret_cast(&fragment[offset])) { static_assert(OtherFragment_::kElements >= Fragment::kElements, ""); } /// Create from non-constant FragmentIterator - CUTLASS_DEVICE FragmentConstIterator( + CUTLASS_HOST_DEVICE FragmentConstIterator( FragmentIterator const& rhs_) : pointer(reinterpret_cast(rhs_.offset)) {} /// The accessor. - CUTLASS_DEVICE AccessType const& at(int d, int h, int w, int c = 0) const { + CUTLASS_HOST_DEVICE AccessType const& at(int d, int h, int w, int c = 0) const { int const imm = ComputeOffsetFromStrides::get(d, h, w, c); return reinterpret_cast(pointer[imm]); } /// The accessor. - CUTLASS_DEVICE AccessType const& operator[](int i) const { + CUTLASS_HOST_DEVICE AccessType const& operator[](int i) const { return reinterpret_cast(pointer[i * kElementsPerAccess]); } /// Is the iterator valid? 
- CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const { return true; } + CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const { return true; } /// The pointer. Element const* pointer; diff --git a/cutlass/fragment_load_store.h b/cutlass/fragment_load_store.h deleted file mode 100644 index a7d272e9e3..0000000000 --- a/cutlass/fragment_load_store.h +++ /dev/null @@ -1,135 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Defines accessors for loading and storing fragments to memory efficiently. -*/ -#pragma once - -#include -#include - -namespace cutlass { - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct FragmentLoad {}; - -template -struct FragmentLoad { - /// The output type. - typedef FragmentElement_ AccessType; - - /// The load function. - static CUTLASS_DEVICE void load(AccessType& value, Scalar_ const* pointer, int offset) { - value.load(&pointer[offset], kStride); - } -}; - -template -struct FragmentLoad { - /// The output type. - typedef typename Vectorize::Type AccessType; - - /// The load function. - static CUTLASS_DEVICE void load(AccessType& value, Scalar_ const* pointer, int offset) { - Load::load(value, pointer, offset); - } -}; - -template -struct FragmentStore {}; - -template -struct FragmentStore { - /// The input type. - typedef FragmentElement_ AccessType; - - /// The store function. - static CUTLASS_DEVICE void store(AccessType const& value, Scalar_* pointer, int offset) { - value.store(&pointer[offset], kStride); - } -}; - -template -struct FragmentStore { - /// The input type. - typedef typename Vectorize::Type AccessType; - - /// The store function. 
- static CUTLASS_DEVICE void store(AccessType const& value, Scalar_* pointer, int offset) { - Store::store(value, pointer, offset); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -} /// namespace cutlass diff --git a/cutlass/fragment_multiply_add.h b/cutlass/fragment_multiply_add.h index 36a4d6f6a5..de2c8052fe 100644 --- a/cutlass/fragment_multiply_add.h +++ b/cutlass/fragment_multiply_add.h @@ -27,52 +27,59 @@ */ #pragma once -#include +#include "cutlass/fragment.h" namespace cutlass { namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// -template +template < typename ScalarAlphaBeta_, + typename ScalarAccum_, + bool fragMul2 = true /*number of element per fragment is multiple of 2*/ +> struct FragmentMultiplyAdd { /// The shape of the instruction. typedef Shape<1, 1, 1, 1> InstructionShape; - /// The type for A. - typedef Scalar_ ScalarA; - /// The type for B. - typedef Scalar_ ScalarB; - /// The type for C and D. - typedef Scalar_ ScalarC; + /// The type for alpha and beta + typedef ScalarAlphaBeta_ ScalarAlphaBeta; + /// The type for accumlator + typedef ScalarAccum_ ScalarAccum; /// Ctor. CUTLASS_DEVICE FragmentMultiplyAdd() {} /// Multiply : d = a*b. template - CUTLASS_DEVICE void multiply(Scalar_ a, FragmentB_ const& b, FragmentCd_& d) { + CUTLASS_DEVICE void multiply(ScalarAlphaBeta a, FragmentB_ const& b, FragmentCd_& d) { +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 530 int const kReduction = FragmentB_::kElements / FragmentCd_::kElements; for (int j = 0; j < FragmentCd_::kElements; ++j) { - d[j] = a * b[j * kReduction + 0]; + d[j] = b[j * kReduction + 0]; for (int k = 1; k < kReduction; ++k) { - d[j] += a * b[j * kReduction + k]; + d[j] += b[j * kReduction + k]; } + d[j] = a * ScalarAlphaBeta(d[j]); } +#endif } /// Multiply : d = a*b + c. 
template - CUTLASS_DEVICE void multiply_add(Scalar_ a, + CUTLASS_DEVICE void multiply_add(ScalarAlphaBeta a, FragmentB_ const& b, FragmentCd_ const& c, FragmentCd_& d) { +#if defined(__CUDACC__) && __CUDA_ARCH__ >= 530 int const kReduction = FragmentB_::kElements / FragmentCd_::kElements; for (int j = 0; j < FragmentCd_::kElements; ++j) { - d[j] = a * b[j * kReduction + 0] + c[j]; + d[j] = b[j * kReduction + 0]; for (int k = 1; k < kReduction; ++k) { - d[j] += a * b[j * kReduction + k]; + d[j] += b[j * kReduction + k]; } + d[j] = a * ScalarAlphaBeta(d[j]) + ScalarAlphaBeta(c[j]); } +#endif } }; @@ -80,15 +87,13 @@ struct FragmentMultiplyAdd { #if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) template <> -struct FragmentMultiplyAdd { +struct FragmentMultiplyAdd { /// The shape of the instruction. - typedef Shape<1, 1, 2, 1> InstructionShape; - /// The type for A. - typedef half ScalarA; - /// The type for B. - typedef half ScalarB; - /// The type for C and D. - typedef half ScalarC; + typedef Shape<1, 1, 1, 1> InstructionShape; + /// The type for alpha and beta + typedef half ScalarAlphaBeta; + /// The type for accumlator + typedef half ScalarAccum; /// Ctor. CUTLASS_DEVICE FragmentMultiplyAdd() {} @@ -97,17 +102,19 @@ struct FragmentMultiplyAdd { template CUTLASS_DEVICE void multiply(half a, FragmentB_ const& b, FragmentCd_& d) { #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530 - - // Assemble a half2 from a. - __half2 const a_half2 = __half2half2(a); // The input. __half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]); // The output. __half2* d_half2 = reinterpret_cast<__half2*>(&d[0]); - int const kReduction = FragmentB_::kElements / FragmentCd_::kElements; + // Assemble a half2 from a. 
+ __half2 const a_half2 = __half2half2(a); + + int const kReduction = (FragmentB_::kElements / FragmentCd_::kElements); + for (int j = 0; j < FragmentCd_::kElements / 2; ++j) { d_half2[j] = __hmul2(a_half2, b_half2[j * kReduction + 0]); + for (int k = 1; k < kReduction; ++k) { d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]); } @@ -115,6 +122,7 @@ struct FragmentMultiplyAdd { #endif } + /// Multiply : d = a*b + c. template CUTLASS_DEVICE void multiply_add(half a, @@ -122,17 +130,19 @@ struct FragmentMultiplyAdd { FragmentCd_ const& c, FragmentCd_& d) { #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530 - // Assemble a half2 from a. - __half2 const a_half2 = __half2half2(a); // The inputs. __half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]); __half2 const* c_half2 = reinterpret_cast<__half2 const*>(&c[0]); // The output. __half2* d_half2 = reinterpret_cast<__half2*>(&d[0]); + // Assemble a half2 from a. + __half2 const a_half2 = __half2half2(a); + int const kReduction = (FragmentB_::kElements / FragmentCd_::kElements); for (int j = 0; j < FragmentCd_::kElements / 2; ++j) { d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + 0], c_half2[j]); + for (int k = 1; k < kReduction; ++k) { d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]); } diff --git a/cutlass/gemm/clear_accumulators.h b/cutlass/gemm/clear_accumulators.h index 441370f4c3..3a2f337525 100644 --- a/cutlass/gemm/clear_accumulators.h +++ b/cutlass/gemm/clear_accumulators.h @@ -27,7 +27,7 @@ */ #pragma once -#include +#include "cutlass/vector.h" namespace cutlass { namespace gemm { @@ -39,11 +39,12 @@ struct ClearAccumulators { /// The shared storage. struct SharedStorage {}; - /// Ctor. - CUTLASS_DEVICE ClearAccumulators() {} /// Ctor. CUTLASS_DEVICE ClearAccumulators(SharedStorage& shared_storage) {} + /// Ctor. + CUTLASS_DEVICE ClearAccumulators() {} + /// Clear the fragment. 
template CUTLASS_DEVICE void clear(Fragment_& fragment) { diff --git a/cutlass/gemm/dgemm_traits.h b/cutlass/gemm/dgemm_traits.h index 0bbc2210bc..5c05590207 100644 --- a/cutlass/gemm/dgemm_traits.h +++ b/cutlass/gemm/dgemm_traits.h @@ -27,13 +27,13 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_epilogue.h" +#include "cutlass/gemm/gemm_epilogue_traits.h" +#include "cutlass/gemm/gemm_global_tile.h" +#include "cutlass/gemm/gemm_shared_tile.h" +#include "cutlass/gemm/gemm_traits.h" +#include "cutlass/gemm/thread_multiply_add.h" namespace cutlass { namespace gemm { @@ -41,10 +41,10 @@ namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// template < - /// The tile size for the GEMM KxNxM. + /// The tile size for threadblock-level GEMM (K-by-N-by-M). typename OutputTile_, - /// The number of accumulators per thread. - typename AccumulatorsPerThread_, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_, /// The number of scalars per LDG for A. int kScalarsPerLdgA_ = 1, /// The number of scalars per LDG for B. @@ -62,7 +62,7 @@ struct DgemmConfig /// The tile size for the GEMM KxNxM. OutputTile_, /// The functor to do the math in the main loop. - ThreadMultiplyAdd, double, double, double>, + ThreadMultiplyAdd, double, double, double>, /// The number of scalars per LDG for A. kScalarsPerLdgA_, /// The number of scalars per STS for A. @@ -82,7 +82,14 @@ struct DgemmConfig /// The number of scalars per LDS for D. 1, /// The number of stages in shared memory. - 2> {}; + 2, + /// kResidueSeparate + false, + /// kResidueInPrologue + false, + /// kLaunchBounds + false + >{}; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -91,12 +98,12 @@ template < MatrixLayout::Kind kLayoutA_, /// The layout for B. 
MatrixLayout::Kind kLayoutB_, - /// The output tile. + /// The tile size for threadblock-level GEMM (K-by-N-by-M) typename OutputTile_ = Shape<8, 64, 128>, /// The functor to use in the epilogue. typename EpilogueFunctor_ = LinearScaling, - /// The number of accumulators per thread. - typename AccumulatorsPerThread_ = Shape<8, 8, 8>, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_ = Shape<8, 8, 8>, /// The number of doubles loaded in one LDG for A. int kScalarsPerLdgA_ = 1, /// The number of doubles loaded in one LDG for B. @@ -105,7 +112,7 @@ template < typename Index_ = int, /// The DGEMM config. typename GemmConfig_ = - DgemmConfig, + DgemmConfig, /// The traits class for the epilogue. typename GemmEpilogueTraits_ = SimplifiedGemmEpilogueTraits > diff --git a/cutlass/gemm/fp16_sgemm_multiply_add.h b/cutlass/gemm/fp16_sgemm_multiply_add.h new file mode 100644 index 0000000000..534b8c8998 --- /dev/null +++ b/cutlass/gemm/fp16_sgemm_multiply_add.h @@ -0,0 +1,83 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template implementing matrix multiply-add operations on fragments. +*/ +#pragma once + +#include "cutlass/fragment.h" +#include "cutlass/gemm/thread_multiply_add.h" +namespace cutlass { +namespace gemm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Template performing matrix multiply-add operation within a thread +template +struct ThreadMultiplyAdd { + /// The shape of the instruction. + typedef Shape<1, 1, 1, 1> InstructionShape; + /// The shape of a thread-level matrix multiply accumulate. + typedef ThreadGemmShape_ ThreadGemmShape; + /// Aliased to "AccumulatorsPerThread" for compatibility. Expect to be renamed in CUTLASS v2.0 + typedef ThreadGemmShape AccumulatorsPerThread; + /// The number of threads per warp. + typedef ThreadsPerWarp_ ThreadsPerWarp; + /// The number of accumulators per warp. + typedef typename ShapeMul::Shape AccumulatorsPerWarp; + /// The type for A. specialized to half + typedef half ScalarA; + /// The fragment for A. + typedef Fragment FragmentA; + /// The type for B.
specialized to half + typedef half ScalarB; + /// The fragment for B. + typedef Fragment FragmentB; + /// The type for C and D. specialized to float + typedef float ScalarC; + /// The accumulators. + typedef Fragment Accumulators; + + /// Ctor. + CUTLASS_DEVICE ThreadMultiplyAdd() {} + + /// Multiply : d = a*b + c. + CUTLASS_DEVICE void multiply_add(FragmentA const& a, + FragmentB const& b, + Accumulators const& c, + Accumulators& d) { + for (int j = 0; j < AccumulatorsPerThread::kH; ++j) { + for (int i = 0; i < AccumulatorsPerThread::kW; ++i) { + d[j * AccumulatorsPerThread::kW + i] = static_cast(a[i]) * static_cast(b[j]) + c[j * AccumulatorsPerThread::kW + i]; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm +} // namespace cutlass diff --git a/cutlass/gemm/fp16_sgemm_traits.h b/cutlass/gemm/fp16_sgemm_traits.h new file mode 100644 index 0000000000..361186455b --- /dev/null +++ b/cutlass/gemm/fp16_sgemm_traits.h @@ -0,0 +1,152 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines structural properties of single-precision GEMM where any number of the input/output + could be fp16 or fp32. The accumulator type stays in fp32 +*/ +#pragma once + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_epilogue.h" +#include "cutlass/gemm/gemm_epilogue_traits.h" +#include "cutlass/gemm/gemm_global_tile.h" +#include "cutlass/gemm/gemm_shared_tile.h" +#include "cutlass/gemm/gemm_traits.h" +#include "cutlass/gemm/fp16_sgemm_multiply_add.h" + +namespace cutlass { +namespace gemm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// The tile size for the GEMM KxNxM. + typename OutputTile_, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_, + /// The type for A + typename ScalarA_, + /// The type for B + typename ScalarB_, + /// The type for C + typename ScalarC_, + /// The type for D + typename ScalarD_, + /// The number of scalars per LDG for A. + int kScalarsPerLdgA_ = 1, + /// The number of scalars per LDG for B.
+ int kScalarsPerLdgB_ = 1> +struct Fp16SgemmConfig : public GemmConfig< + /// The scalar type for A. + ScalarA_, + /// The scalar type for B. + ScalarB_, + /// The scalar type for C. + ScalarC_, + /// The scalar type for D. + ScalarD_, + /// The tile size for the GEMM KxNxM. + OutputTile_, + /// The functor to do the math in the main loop. + ThreadMultiplyAdd, ScalarA_, ScalarB_, float /*for sgemm accum is float*/>, + /// The number of scalars per LDG for A. + kScalarsPerLdgA_, + /// The number of scalars per STS for A. + kScalarsPerLdgA_, + /// The number of scalars per LDS for A. + 4, + /// The number of scalars per LDG for B. + kScalarsPerLdgB_, + /// The number of scalars per STS for B. + kScalarsPerLdgB_, + /// The number of scalars per LDS for B. + 4, + /// The number of scalars per LDG for C and STG for D. + 1, + /// The number of scalars per STS for D. + 4, + /// The number of scalars per LDS for D. + 1, + /// The number of stages in shared memory. + 2> {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// The layout for A. + MatrixLayout::Kind kLayoutA_, + /// The layout for B. + MatrixLayout::Kind kLayoutB_, + /// The output tile. + typename OutputTile_ = Shape<8, 128, 128>, + /// The type for A + typename ScalarA_ = half, + /// The type for B + typename ScalarB_ = half, + /// The type for C + typename ScalarC_ = half, + /// The type for D + typename ScalarD_ = half, + /// the Type for alpha and beta, + typename Scalar_ = half, + /// The functor to use in the epilogue. + typename EpilogueFunctor_ = LinearScaling >, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_ = Shape<8, 8, 8>, + /// The number of floats loaded in one LDG for A. + int kScalarsPerLdgA_ = 1, + /// The number of floats loaded in one LDG for B. + int kScalarsPerLdgB_ = 1, + /// The index. + typename Index_ = int, + /// The SGEMM config. 
+ typename GemmConfig_ = + Fp16SgemmConfig, + /// The traits class for the epilogue. + typename GemmEpilogueTraits_ = + SimplifiedGemmEpilogueTraits > +struct Fp16SgemmSgemmTraits : public SimplifiedGemmTraits< + // The layout for A. + kLayoutA_, + // The layout for B. + kLayoutB_, + // The config. + GemmConfig_, + // The epilogue. + GemmEpilogue, + // The index. + Index_> {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm +} // namespace cutlass diff --git a/cutlass/gemm/gemm.h b/cutlass/gemm/gemm.h index c50a3f04b4..6340ab4f33 100644 --- a/cutlass/gemm/gemm.h +++ b/cutlass/gemm/gemm.h @@ -31,16 +31,32 @@ #include #endif -#include -#include - +#include "cutlass/coord.h" +#include "cutlass/util/platform.h" namespace cutlass { namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// +/// GEMM kernel with launch bounds specified +template +__global__ __launch_bounds__(Gemm_::kThreads) +void gemm_kernel(typename Gemm_::Params params) { + // Declare shared memory. + __shared__ typename Gemm_::SharedStorage shared_storage; + + // Construct the GEMM object. + Gemm_ gemm(params, shared_storage); + // Run GEMM. + gemm.multiply_add(); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// GEMM kernel without launch bounds specified template -__global__ /*__launch_bounds__(Gemm_::kThreads)*/ void gemm_kernel(typename Gemm_::Params params) { +__global__ /* __launch_bounds__(Gemm_::kThreads) */ +void gemm_kernel_nolb(typename Gemm_::Params params) { // Declare shared memory. 
__shared__ typename Gemm_::SharedStorage shared_storage; @@ -52,28 +68,22 @@ __global__ /*__launch_bounds__(Gemm_::kThreads)*/ void gemm_kernel(typename Gemm //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct GemmDesc { - /// The dimensions of the GEMM. - Index_ m, n, k; - /// The alpha/beta scaling values. - Scalar_ alpha, beta; - /// The source matrix A. - void const* d_a; - /// The stride for A. - Index_ lda; - /// The source matrix B. - void const* d_b; - /// The stride for B. - Index_ ldb; - /// The source matrix C. - void const* d_c; - /// The stride for C. - Index_ ldc; - /// The destination matrix D. - void* d_d; - /// The stride for D. - Index_ ldd; +/// Partial specialization for launching the GEMM kernel with or without launch bounds +template +struct Launch { + Launch(typename Gemm::Params params, dim3 grid, dim3 block, cudaStream_t stream = 0) { + gemm_kernel<<< grid, block, 0, stream >>>(params); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for launching the GEMM kernel with or without launch bounds +template +struct Launch { + Launch(typename Gemm::Params params, dim3 grid, dim3 block, cudaStream_t stream = 0) { + gemm_kernel_nolb<<< grid, block, 0, stream >>>(params); + } }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -100,86 +110,52 @@ struct Gemm { /// The index. typedef typename Traits::Index Index; + /// Define the mainloop iteration size + typedef typename Traits::MultiplyAdd MultiplyAdd; + /// The number of threads. static int const kThreads = Traits::GemmConfig::kThreads; - /// The params. 
- struct Params : public Traits::Params { - CUTLASS_HOST_DEVICE int initialize(Index m, - Index n, - Index k, - ScalarEpilogue alpha, - ScalarA const* d_a, - Index lda, - ScalarB const* d_b, - Index ldb, - ScalarEpilogue beta, - ScalarC const* d_c, - Index ldc, - ScalarD* d_d, - Index ldd) { - GemmDesc desc; - desc.m = m; - desc.n = n; - desc.k = k; - desc.alpha = alpha; - desc.beta = beta; - desc.d_a = reinterpret_cast(d_a); - desc.lda = lda; - desc.d_b = reinterpret_cast(d_b); - desc.ldb = ldb; - desc.d_c = reinterpret_cast(d_c); - desc.ldc = ldc; - desc.d_d = reinterpret_cast(d_d); - desc.ldd = ldd; - return Traits::Params::initialize(desc); - } - }; + // Number of warp-level multiply-accumulate steps executed by each warp. + static Index const kWarpGemmSteps = + Traits::GemmConfig::AccumulatorsPerWarp::kD / MultiplyAdd::InstructionShape::kD; + + // Make sure we have at least 2 unrolling steps or our pipelining is not going to work. + static_assert(kWarpGemmSteps >= 2, "The pipelining assumes at least two steps"); + /// Use the params object defined in traits + typedef typename Traits::Params Params; + +// +// Static function members +// + +/// Support for NVRTC #if !defined(__CUDACC_RTC__) /// Launch the kernel. static __host__ cudaError_t launch(Params const& params, cudaStream_t stream = cudaStreamDefault) { - // Setup the grid. - dim3 grid; - grid.x = (params.m + Traits::OutputTile::kW - 1) / Traits::OutputTile::kW; - grid.y = (params.n + Traits::OutputTile::kH - 1) / Traits::OutputTile::kH; - - // The number of threads. - dim3 block; - block.x = kThreads; // Launch the kernel. - void const* params_ = reinterpret_cast(&params); - - return cudaLaunchKernel(reinterpret_cast(&gemm_kernel), - grid, - block, - const_cast(&params_), - 0, - stream); + Launch( + params, params.grid, params.block, stream); + + return cudaGetLastError(); } /// Launch the kernel.
static __host__ cudaError_t launch(CUfunction kernel, Params const& params, CUstream stream = CU_STREAM_LEGACY) { - // Setup the grid. - dim3 grid; - grid.x = (params.m + Traits::OutputTile::kW - 1) / Traits::OutputTile::kW; - grid.y = (params.n + Traits::OutputTile::kH - 1) / Traits::OutputTile::kH; - - // The number of threads. - dim3 block; - block.x = kThreads; // Launch the kernel. void* params_[] = {const_cast(reinterpret_cast(&params))}; - // return cudaLaunchKernel(reinterpret_cast(&gemm_kernel), grid, block, - // const_cast(&params_), 0, stream); CUresult result = cuLaunchKernel( - kernel, grid.x, grid.y, grid.z, block.x, block.y, block.z, 0, stream, params_, 0); + kernel, + params.grid.x, params.grid.y, params.grid.z, + params.block.x, params.block.y, params.block.z, + 0, stream, params_, 0); if (result != CUDA_SUCCESS) { return cudaErrorLaunchFailure; @@ -189,39 +165,41 @@ struct Gemm { #endif + // + // Methods + // + /// Ctor. CUTLASS_DEVICE Gemm(Params const& params_, SharedStorage& shared_storage_) : params(params_), shared_storage(shared_storage_) {} - /// Consume a single iteration of the loop. - template - CUTLASS_DEVICE void consume_tile(typename Traits::GlobalLoadStream& global_stream, - typename Traits::SharedLoadStream& shared_load_stream, - typename Traits::MultiplyAdd::Accumulators& accumulators, + /// Computes a warp-level GEMM on data held in shared memory + template + CUTLASS_DEVICE void consume_tile(typename Traits::GlobalLoadStream& global_to_shared_stream, + typename Traits::SharedStream& shared_load_stream, + typename MultiplyAdd::Accumulators& accumulators, Index outer_k) { - // If that's the last "load iteration" update the predicates. - if (!kIsLastIteration) { - global_stream.move_to_residue(outer_k); + // If residue portion and not calculating residue in prolog, update residue predicates now.
+ if (Residue && outer_k <= Traits::OutputTile::kD) { + global_to_shared_stream.residue(outer_k); } - // Load data for the next iteration of the main loop. - if (!kIsLastIteration) { - global_stream.copy(); + // Load data for the next iteration of the main loop (unless it's the last iteration). + if (!LastIteration) { + global_to_shared_stream.copy(); } - // The unrolling steps for the main loop. - int const kUnrollingSteps = - Traits::MultiplyAdd::AccumulatorsPerWarp::kD / Traits::MultiplyAdd::InstructionShape::kD; - CUTLASS_PRAGMA_UNROLL - for (int step = 0; step < kUnrollingSteps - 1; ++step) { + for (int step = 0; step < kWarpGemmSteps - 1; ++step) { // Trigger the copy from shared memory for the next A/B values. shared_load_stream.copy(step + 1); + // Make sure the values are available for the current iteration to do the multiply-add. shared_load_stream.commit(step); + MultiplyAdd multiply_add; + // Do the math on the fragments of the current iteration. - typename Traits::MultiplyAdd multiply_add; multiply_add.multiply_add(shared_load_stream.fragment_a(step), shared_load_stream.fragment_b(step), accumulators, @@ -232,28 +210,25 @@ struct Gemm { Traits::shared_load_fence(true); // Commit the data in shared memory for A/B. - if (!kIsLastIteration) { - global_stream.commit(); + if (!LastIteration) { + global_to_shared_stream.commit(); } - // Make sure the data is in shared memory. Traits::shared_store_fence(true); - // Trigger the loads for the next iteration (if needed). - if (!kIsLastIteration) { + if (!LastIteration) { // Move to the next stage for the load (if it makes sense). shared_load_stream.inc_stage(); // Trigger the copy from shared memory for the next loop iteration. shared_load_stream.copy(0); } - // Make sure the values are available for the current iteration to do the multiply-add. - shared_load_stream.commit(kUnrollingSteps - 1); + shared_load_stream.commit(kWarpGemmSteps - 1); // Do the math on the fragments of the current iteration. 
- typename Traits::MultiplyAdd multiply_add; - multiply_add.multiply_add(shared_load_stream.fragment_a(kUnrollingSteps - 1), - shared_load_stream.fragment_b(kUnrollingSteps - 1), + MultiplyAdd multiply_add; + multiply_add.multiply_add(shared_load_stream.fragment_a(kWarpGemmSteps - 1), + shared_load_stream.fragment_b(kWarpGemmSteps - 1), accumulators, accumulators); } @@ -262,76 +237,112 @@ struct Gemm { CUTLASS_DEVICE void multiply_add() { // Swizzle the IDs of the block (to enable better cache behavior). typename Traits::BlockSwizzle block_swizzle; - dim3 block = block_swizzle.swizzle(); - - // Scale the id. - block.x *= Traits::OutputTile::kW; - block.y *= Traits::OutputTile::kH; + Coord<3> threadblock_offset = + block_swizzle.get_threadblock_offset(make_Coord_from_shape()); // We may want to use shared memory to clear the registers. typedef typename Traits::ClearAccumulators ClearAccumulators; // The streams to read A/B from global memory to shared memory. - typename Traits::GlobalLoadStream global_stream(params, shared_storage, block); + typename Traits::GlobalLoadStream global_to_shared_stream( + params.global_to_shared_stream, + shared_storage.main_loop.global_to_shared_stream, + shared_storage.main_loop.threadblock_tile.reference(), + params.problem_size.knm(), + threadblock_offset); - // Create the accumulator clear. - ClearAccumulators clear(shared_storage.main_loop.clear); + // update A and B pointer offset based on batch_id and batch_stride_offset + //global_to_shared_stream.add_pointer_offset(block_swizzle.get_batch_id(), params.batch_stride_A, params.batch_stride_B); + global_to_shared_stream += make_Coord(block_swizzle.get_batch_id(), 0, 0); - // By how much we unroll the main loop. - Index const kUnroll = static_cast(Traits::OutputTile::kD); + // Create the accumulator clear. + ClearAccumulators clear; - // If we do not have enough steps in the main loop, trigger the residue code. 
- global_stream.move_to_residue(params.k); + // Deal with residue in prolog. + global_to_shared_stream.move_to_residue(params.problem_size[0], Traits::OutputTile::kD); // Fetch the fragments for A and B from global memory. - global_stream.copy(); + global_to_shared_stream.copy(); // Copy the elements to shared memory (after transformation if needed). - global_stream.commit(); + global_to_shared_stream.commit(); // Make sure the data is in shared memory. Traits::shared_store_fence(false); - // Rollback to the beginning of the GEMM-K dimension. It may have no impact. - global_stream.rollback(); - - // The unrolling steps for the main loop. - int const kUnrollingSteps = - Traits::MultiplyAdd::AccumulatorsPerWarp::kD / Traits::MultiplyAdd::InstructionShape::kD; - - // Make sure we have at least 2 unrolling steps or our pipeling is not going to work. - static_assert(kUnrollingSteps >= 2, "The pipelining assumes at least two steps"); + // Rollback to the beginning of the first tile (if residue exists). + global_to_shared_stream.rollback(params.problem_size[0] % Traits::OutputTile::kD); // The stream of data from shared memory to fragments. - typename Traits::SharedLoadStream shared_load_stream(params, shared_storage); + typename Traits::SharedStream shared_load_stream( + params.shared_stream, + shared_storage.main_loop.threadblock_tile.reference()); // Trigger the copy from shared memory for the 1st stream. shared_load_stream.copy(0); // Allocate the accumulators. - typename Traits::MultiplyAdd::Accumulators accumulators; + typename MultiplyAdd::Accumulators accumulators; + // Clear the accumulators. clear.clear(accumulators); - // The loop index. - Index outer_k = params.k - kUnroll; + // Initial index + Index outer_k = params.problem_size[0] - Traits::OutputTile::kD; - // Enter the main loop and iterate. 
- for (; outer_k > 0; outer_k -= kUnroll) { - consume_tile(global_stream, shared_load_stream, accumulators, outer_k); - } + // Check if we are computing residue in prolog or not. + if (Traits::GemmConfig::kResidueInProlog) { + + // Execute all mainloop iterations but the last one. + + CUTLASS_GEMM_LOOP + for (; outer_k > 0; outer_k -= Traits::OutputTile::kD) { + consume_tile( + global_to_shared_stream, shared_load_stream, accumulators, outer_k); + + } + + // Don't load data for the last "residue" portion since we've already computed the residue. + CUTLASS_GEMM_LOOP + for (; outer_k > -Traits::OutputTile::kD; outer_k -= Traits::OutputTile::kD) { + consume_tile( + global_to_shared_stream, shared_load_stream, accumulators, outer_k); - // Residual loop. - for (; outer_k > -kUnroll; outer_k -= kUnroll) { - consume_tile(global_stream, shared_load_stream, accumulators, outer_k); + } + } else { + // When kResidueSeparate = true, execute all mainloop iterations but the last two without any + // consideration for K-residue or predicate updates. This improves the steady state of some + // kernels. + if (Traits::GemmConfig::kResidueSeparate) { + + CUTLASS_GEMM_LOOP + for (; outer_k > Traits::OutputTile::kD; outer_k -= Traits::OutputTile::kD) { + consume_tile( + global_to_shared_stream, shared_load_stream, accumulators, outer_k); + + } + } + + // Execute remaining tiles with K-residue predicate updates enabled. + + CUTLASS_GEMM_LOOP + for (; outer_k > -Traits::OutputTile::kD; outer_k -= Traits::OutputTile::kD) { + consume_tile( + global_to_shared_stream, shared_load_stream, accumulators, outer_k); + + } } // Epilogue. 
typedef typename Traits::Epilogue Epilogue; - Epilogue epilogue(params.epilogue, shared_storage.epilogue, params.m, params.n); - epilogue.epilogue(cutlass::make_Coord(0, block.y, block.x), accumulators); + Epilogue epilogue(params.epilogue, shared_storage.epilogue, params.problem_size.knm()); + epilogue.epilogue(accumulators, threadblock_offset, block_swizzle.get_batch_id()); } + // + // Data members + // + /// The params. Params const& params; /// The shared storage. diff --git a/cutlass/gemm/gemm_config.h b/cutlass/gemm/gemm_config.h new file mode 100644 index 0000000000..76df0add62 --- /dev/null +++ b/cutlass/gemm/gemm_config.h @@ -0,0 +1,145 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines properties of GEMM computation that impose some constraints on caller. +*/ +#pragma once + +#include "cutlass/shape.h" + +namespace cutlass { +namespace gemm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// The scalar type for A. + typename ScalarA_, + /// The scalar type for B. + typename ScalarB_, + /// The scalar type for C. + typename ScalarC_, + /// The scalar type for D. + typename ScalarD_, + /// The threadblock tile size for the GEMM KxNxM. + typename OutputTile_, + /// The functor to do the math. + typename MultiplyAdd_, + /// The number of scalars per LDG for A. + int kScalarsPerLdgA_, + /// The number of scalars per STS for A. + int kScalarsPerStsA_, + /// The number of scalars per LDG for A. + int kScalarsPerLdsA_, + /// The number of scalars per LDG for B. + int kScalarsPerLdgB_, + /// The number of scalars per STS for B. + int kScalarsPerStsB_, + /// The number of scalars per LDS for B. + int kScalarsPerLdsB_, + /// The number of scalars per LDG for C and STG for D. + int kScalarsPerLdgCAndStgD_, + /// The number of scalars per STS for D. + int kScalarsPerStsD_, + /// The number of scalars per LDS for D. + int kScalarsPerLdsD_, + /// The number of stages in shared memory to do single/double/triple-buffering. 
+ int kStages_, + /// If true, separate mainloops are instantiated (the first without residue predicate updates). If false, residue is computed in a single mainloop. + bool kResidueSeparate_ = false, + /// Is residue performed in prologue? + bool kResidueInProlog_ = false, + /// If true, kernel is launched with CUDA launch bounds specified + bool kLaunchBounds_ = true> +struct GemmConfig { + // + /// The scalar for A. + typedef ScalarA_ ScalarA; + /// The scalar for B. + typedef ScalarB_ ScalarB; + /// The scalar for C. + typedef ScalarC_ ScalarC; + /// The scalar for D. + typedef ScalarD_ ScalarD; + + /// The tile. + typedef OutputTile_ OutputTile; + /// The functor to do D = A*B + C. + typedef MultiplyAdd_ MultiplyAdd; + /// The shape of the instruction. + typedef typename MultiplyAdd::InstructionShape InstructionShape; + /// The shape of warp-level GEMM + typedef typename MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp; + /// The accumulators. + typedef typename MultiplyAdd::Accumulators Accumulators; + + /// The number of warps. + typedef typename ShapeDiv::Shape Warps; + /// The default warp size (32 threads per warp). + static int const kWarpSize = cutlass::kWarpSize; + /// The number of threads. + static int const kThreads = ShapeCount::kCount * kWarpSize; + + /// The number of scalars per LDG/STS/LDS for A. + static int const kScalarsPerLdgA = kScalarsPerLdgA_; + static int const kScalarsPerStsA = kScalarsPerStsA_; + static int const kScalarsPerLdsA = kScalarsPerLdsA_; + + /// The number of scalars per LDG/STS/LDS for B. + static int const kScalarsPerLdgB = kScalarsPerLdgB_; + static int const kScalarsPerStsB = kScalarsPerStsB_; + static int const kScalarsPerLdsB = kScalarsPerLdsB_; + + /// The number of scalars per LDG for C. + static int const kScalarsPerLdgC = kScalarsPerLdgCAndStgD_; + + /// The number of scalars per STS/LDS/STG for D.
+ static int const kScalarsPerStgD = kScalarsPerLdgCAndStgD_; + static int const kScalarsPerStsD = kScalarsPerStsD_; + static int const kScalarsPerLdsD = kScalarsPerLdsD_; + + /// The number of accumulators that are going to be fed from one LDS A/B. + static int const kAccumulatorsPerLdsA = kScalarsPerLdsA / InstructionShape::kD; + static int const kAccumulatorsPerLdsB = kScalarsPerLdsB / InstructionShape::kD; + + /// The number of stages in shared memory to implement double, triple, more-buffering. + static int const kStages = kStages_; + + /// If true, mainloop is instantiated twice. The first instantiation contains no predicate + // updates and is more efficient for some kernels. If false, only a single mainloop is + // instantaited. + static bool const kResidueSeparate = kResidueSeparate_; + + /// If true, residue is computed in the prologue. + static bool const kResidueInProlog = kResidueInProlog_; + + /// If true, kernel is launched with launch bounds specified + static bool const kLaunchBounds = kLaunchBounds_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm +} // namespace cutlass diff --git a/cutlass/gemm/gemm_coord.h b/cutlass/gemm/gemm_coord.h new file mode 100644 index 0000000000..8e36bb0430 --- /dev/null +++ b/cutlass/gemm/gemm_coord.h @@ -0,0 +1,203 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief GemmCoord is a structure derived from Coord<4> that specifies a location within the + coordinate system of a GEMM problem. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/coord.h" +#include "cutlass/util/platform.h" + +namespace cutlass { +namespace gemm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// GemmCoord is a structure derived from Coord<4> that specifies a location within the +/// coordinate space of a GEMM problem. 
+struct GemmCoord : public Coord<4, int> { + + /// Integer-valued index + typedef int Index; + + /// Base type is a Coord of rank=4 + typedef Coord<4, Index> Base; + + /// GEMM K dimension - inner dimension of the GEMM problem + static int const kK = 0; + + /// GEMM N dimension - columns of the output C matrix + static int const kN = 1; + + /// GEMM M dimension - rows of the output C matrix + static int const kM = 2; + + /// Batch dimension - for generalizing to larger problems + static int const kBatch = 3; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + GemmCoord() { } + + /// Constructs from Coord<3> and a batch + CUTLASS_HOST_DEVICE + GemmCoord(Coord<3, Index> const &coord, Index _batch = 0): Base(make_Coord(coord[0], coord[1], coord[2], _batch)) { } + + /// Constructs from Coord<4> + CUTLASS_HOST_DEVICE + GemmCoord(Coord<4, Index> const &coord): Base(coord) { } + + /// Constructs from an array of coordinate elements + CUTLASS_HOST_DEVICE + GemmCoord(Index coord[4]): Base(coord) { } + + /// Helper to construct from a K, N, M, batch variables + CUTLASS_HOST_DEVICE + GemmCoord(Index k, Index n, Index m, Index batch = 0): Base(make_Coord(k, n, m, batch)) { } + + /// Returns the GEMM M coordinate + CUTLASS_HOST_DEVICE + Index const & m() const { return this->at(kM); } + + /// Returns reference to the GEMM M coordinate + CUTLASS_HOST_DEVICE + Index & m() { return this->at(kM); } + + /// Returns the GEMM N coordinate + CUTLASS_HOST_DEVICE + Index const & n() const { return this->at(kN); } + + /// Returns reference to the GEMM N coordinate + CUTLASS_HOST_DEVICE + Index & n() { return this->at(kN); } + + /// Returns the GEMM K coordinate + CUTLASS_HOST_DEVICE + Index const & k() const { return this->at(kK); } + + /// Returns reference to the GEMM K coordinate + CUTLASS_HOST_DEVICE + Index & k() { return this->at(kK); } + + /// Returns the GEMM batch coordinate + CUTLASS_HOST_DEVICE + Index const & batch() const { return this->at(kBatch); } + + /// 
Returns reference to the GEMM batch coordinate + CUTLASS_HOST_DEVICE + Index & batch() { return this->at(kBatch); } + + /// Obtains a Coord<3> from GemmCoord + CUTLASS_HOST_DEVICE + Coord<3> knm() const { + return make_Coord(k(), n(), m()); + } + + /// Obtains a Coord<2> from GemmCoord + CUTLASS_HOST_DEVICE + Coord<2> nm() const { + return make_Coord(n(), m()); + } + + /// Obtains a Coord<2> from GemmCoord + CUTLASS_HOST_DEVICE + Coord<2> km() const { + return make_Coord(k(), m()); + } + + /// Obtains a Coord<2> from GemmCoord + CUTLASS_HOST_DEVICE + Coord<2> kn() const { + return make_Coord(k(), n()); + } + + // + // Coord operators + // + + /// Element-wise addition + CUTLASS_HOST_DEVICE + GemmCoord operator+(Base const& b) const { + return GemmCoord(Base::operator+(b)); + } + + /// Element-wise subtraction + CUTLASS_HOST_DEVICE + GemmCoord operator-(Base const& b) const { + return GemmCoord(Base::operator-(b)); + } + + /// Element-wise multiplication + CUTLASS_HOST_DEVICE + GemmCoord operator*(Base const& b) const { + return GemmCoord(Base::operator*(b)); + } + + /// Element-wise division + CUTLASS_HOST_DEVICE + GemmCoord operator/(Base const& b) const { + return GemmCoord(Base::operator/(b)); + } + + /// In-place addition + CUTLASS_HOST_DEVICE + GemmCoord& operator+=(Base const& b) { + Base::operator+=(b); + return *this; + } + + /// In-place subtraction + CUTLASS_HOST_DEVICE + GemmCoord& operator-=(Base const& b) { + Base::operator-=(b); + return *this; + } + + /// In-place multiplication + CUTLASS_HOST_DEVICE + GemmCoord& operator*=(Base const& b) { + Base::operator*=(b); + return *this; + } + + /// In-place division + CUTLASS_HOST_DEVICE + GemmCoord& operator/=(Base const& b) { + Base::operator/=(b); + return *this; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm +} // namespace cutlass diff --git a/cutlass/gemm/gemm_desc.h b/cutlass/gemm/gemm_desc.h new file mode 100644 
index 0000000000..80f4b36557 --- /dev/null +++ b/cutlass/gemm/gemm_desc.h @@ -0,0 +1,205 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implements a software-pipelined efficient GEMM. 
+*/ +#pragma once + +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/gemm_coord.h" + +namespace cutlass { +namespace gemm { + +/// GEMM problem description +template < + /// Source accumulator matrix type + typename AType_, + /// Destination accumulator type + typename BType_, + /// Source accumulator matrix type + typename CType_, + /// Destination accumulator type + typename DType_, + /// Scalar type for alpha and beta + typename SType_, + /// Index type for dimensions and strides + typename Index_ = int +> struct GemmDesc { + // + // Type definitions + // + + /// Index type for dimensions and strides + typedef Index_ Index; + + /// Source accumulator matrix type + typedef AType_ AType; + + /// Tensor reference to A operand + typedef TensorRef TensorRefA; + + /// Destination accumulator type + typedef BType_ BType; + + /// Tensor reference to B operand + typedef TensorRef TensorRefB; + + /// Source accumulator matrix type + typedef CType_ CType; + + /// Tensor reference to C operand + typedef TensorRef TensorRefC; + + /// Destination accumulator type + typedef DType_ DType; + + /// Tensor reference to D operand + typedef TensorRef TensorRefD; + + /// Scalar type for alpha and beta + typedef SType_ SType; + + // + // Data members + // + + /// The dimensions of the GEMM. + GemmCoord problem_size; + + /// The alpha scaling values. + SType alpha; + + /// The source matrix A. + TensorRefA A; + + /// batch stride for A operand + long long batch_stride_A; + + /// The source matrix B. + TensorRefB B; + + /// batch stride for B operand + long long batch_stride_B; + + /// The beta scaling values. + SType beta; + + /// The source matrix C. + TensorRefC C; + + /// batch stride for C operand + long long batch_stride_C; + + /// The destination matrix D. 
+ TensorRefD D; + + /// batch stride for D operand + long long batch_stride_D; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + GemmDesc(): problem_size(0, 0, 0, 1), alpha(1), beta(0) {} + + /// Constructor for basic GEMM with batch count = 1 + CUTLASS_HOST_DEVICE + GemmDesc(Coord<3> _problem_size, + SType _alpha, + TensorRefA const &_A, + TensorRefB const &_B, + SType _beta, + TensorRefC const &_C, + TensorRefD const &_D + ): + problem_size(_problem_size[0], _problem_size[1], _problem_size[2], 1), + alpha(_alpha), + A(_A), + batch_stride_A(0), + B(_B), + batch_stride_B(0), + beta(_beta), + C(_C), + batch_stride_C(0), + D(_D), + batch_stride_D(0) {} + + /// Constructor for basic GEMM with batch count = 1 + CUTLASS_HOST_DEVICE + GemmDesc(GemmCoord _problem_size, + SType _alpha, + TensorRefA const &_A, + TensorRefB const &_B, + SType _beta, + TensorRefC const &_C, + TensorRefD const &_D + ): + problem_size(_problem_size.k(), _problem_size.n(), _problem_size.m(), 1), + alpha(_alpha), + A(_A), + batch_stride_A(0), + B(_B), + batch_stride_B(0), + beta(_beta), + C(_C), + batch_stride_C(0), + D(_D), + batch_stride_D(0) { + + assert(_problem_size.batch() == 1); + } + + /// Constructor for strided batched GEMM + CUTLASS_HOST_DEVICE + GemmDesc(GemmCoord _problem_size, + SType _alpha, + TensorRefA const &_A, + long long _batch_stride_A, + TensorRefB const &_B, + long long _batch_stride_B, + SType _beta, + TensorRefC const &_C, + long long _batch_stride_C, + TensorRefD const &_D, + long long _batch_stride_D + ): + problem_size(_problem_size), + alpha(_alpha), + A(_A), + batch_stride_A(_batch_stride_A), + B(_B), + batch_stride_B(_batch_stride_B), + beta(_beta), + C(_C), + batch_stride_C(_batch_stride_C), + D(_D), + batch_stride_D(_batch_stride_D) {} +}; + +} // namespace gemm +} // namespace cutlass diff --git a/cutlass/gemm/gemm_epilogue.h b/cutlass/gemm/gemm_epilogue.h index bc25307775..d9469bb550 100644 --- a/cutlass/gemm/gemm_epilogue.h +++
b/cutlass/gemm/gemm_epilogue.h @@ -29,26 +29,15 @@ */ #pragma once -#include -#include -#include +#include "cutlass/convert.h" +#include "cutlass/coord.h" +#include "cutlass/fragment.h" namespace cutlass { namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// -template -CUTLASS_DEVICE bool is_zero(T x) { - return x == T(0); -} - -#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) -CUTLASS_DEVICE bool is_zero(half x) { return reinterpret_cast(x) == int16_t(0); } -#endif - -//////////////////////////////////////////////////////////////////////////////////////////////////// - template struct GemmEpilogue { /// The traits class. @@ -85,9 +74,7 @@ struct GemmEpilogue { /// The shared store transformer for D. typedef typename Traits::SharedStoreTransformerD SharedStoreTransformerD; /// The iterator to load D in shared memory. - typedef typename Traits::SharedLoadIteratorD SharedLoadIteratorD; - /// The shared load transformer for D. - typedef Copy SharedLoadTransformerD; + typedef typename Traits::SharedLoadStreamD SharedLoadStreamD; /// The index. typedef typename Traits::Index Index; @@ -100,33 +87,28 @@ struct GemmEpilogue { /// Ctor. CUTLASS_DEVICE GemmEpilogue(Params const& params_, SharedStorage& shared_storage_, - Index m_, - Index n_) - : params(params_), shared_storage(shared_storage_), m(m_), n(n_) {} + Coord<3> const& _problem_size) + : params(params_), shared_storage(shared_storage_), problem_size(_problem_size), functor(params_.functor) {} /// Execute the epilogue. 
- CUTLASS_DEVICE void epilogue(Coord<3> const& block, Accumulators& accumulators) { - if (is_zero(params.functor.beta)) { - epilogue_with_or_without_beta(block, accumulators); + CUTLASS_DEVICE void epilogue(Accumulators& accumulators, + Coord<3> const& block = make_Coord(0, 0, 0), + int batch_id = 0) { + if (functor.source_required()) { + epilogue_with_or_without_beta(accumulators, block, batch_id); } else { - epilogue_with_or_without_beta(block, accumulators); + epilogue_with_or_without_beta(accumulators, block, batch_id); } } - template - CUTLASS_DEVICE void epilogue_with_or_without_beta(Coord<3> const& block, - Accumulators& accumulators) { - - // The problem size. - Coord<3> const bounds = cutlass::make_Coord(0, n, m); - - // The functor. - Functor functor(params.functor); + template + CUTLASS_DEVICE void epilogue_with_or_without_beta(Accumulators& accumulators, + Coord<3> const& block, + int batch_id) { // The C fragment. typename GlobalLoadIteratorC::Fragment fragment_c; // The transformed C fragment. typename GlobalTransformerC::OutputFragment transformed_c; - CUTLASS_PRAGMA_UNROLL for (int h = 0; h < Iterations::kH; ++h) { // Compute pointer and predicate offsets for C and D global iterators. @@ -136,6 +118,7 @@ struct GemmEpilogue { Iterations::kW + params.stride_h) * h; + int const predicate_offset = ((params.iterator_d.predicate_inc_h * (GlobalStoreIteratorD::Iterations::kH - 1) + params.iterator_d.predicate_inc_advance) * @@ -145,32 +128,40 @@ struct GemmEpilogue { // The iterator to load the elements of the C matrix. GlobalLoadIteratorC global_load_iterator( - params.iterator_c, bounds, block, pointer_offset, predicate_offset); + params.iterator_c, problem_size, block, pointer_offset, predicate_offset); + + // update C pointer offset based on batch_id and batch_stride_offset + //global_load_iterator.add_pointer_offset(batch_id * params.batch_stride_offset_c); + global_load_iterator += make_Coord(batch_id, 0, 0); + // The transformer for C. 
GlobalTransformerC transformer_c; // The transformer for D. GlobalTransformerD transformer_d; // The iterator to store into the D matrix. GlobalStoreIteratorD global_store_iterator( - params.iterator_d, bounds, block, pointer_offset, predicate_offset); + params.iterator_d, problem_size, block, pointer_offset, predicate_offset); + + // update D pointer offset based on batch_id and batch_stride_offset + //global_store_iterator.add_pointer_offset(batch_id * params.batch_stride_offset_d); + global_store_iterator += make_Coord(batch_id, 0, 0); - // The transformer to transform before storing to shared memory. SharedStoreTransformerD shared_store_transformer; typename SharedStoreTransformerD::OutputFragment shared_store_transformed_d; - // The iterator to store to shared memory. - SharedStoreIteratorD shared_store_iterator(params.shared_store_iterator_d, - shared_storage.shared_stream.store); + SharedStoreIteratorD shared_store_iterator( + params.shared_store_iterator_d, + reinterpret_cast(shared_storage.data())); - // The iterator to load from shared memory. TODO: Use a stream. - SharedLoadIteratorD shared_load_iterator(params.shared_load_iterator_d, - shared_storage.shared_stream.load); + SharedLoadStreamD shared_load_stream( + params.shared_load_stream_d, + reinterpret_cast(shared_storage.data())); CUTLASS_PRAGMA_UNROLL for (int w = 0; w < Iterations::kW; ++w) { // Load the C matrix into fragment. - if (!kBetaIsZero_) { - iterator_load(global_load_iterator, fragment_c); + if (kSourceRequired) { + global_load_iterator.load_post_increment(fragment_c); } // Make sure we can write to shared memory. 
@@ -180,33 +171,33 @@ struct GemmEpilogue { int const offset = (h * Iterations::kW + w) * SharedStoreIteratorD::Fragment::kElements; shared_store_transformer.transform(accumulators, offset, shared_store_transformed_d); - shared_iterator_store(shared_store_iterator, shared_store_transformed_d); + shared_store_iterator.store_post_increment(shared_store_transformed_d); // Make sure the data is in shared memory. shared_store_fence(); // Copy the accumulators back to registers from shared memory. - typename SharedLoadIteratorD::Fragment fetched_d; - shared_iterator_load(shared_load_iterator, fetched_d); + shared_load_stream.copy(); + shared_load_stream.commit(); // Do the math. typename GlobalTransformerD::InputFragment fragment_d; - if (kBetaIsZero_) { - functor.evaluate(fetched_d, fragment_d); - } else { + if (kSourceRequired) { // Transform C fragment. transformer_c.transform(fragment_c, transformed_c); // Do the math. - functor.evaluate(fetched_d, transformed_c, fragment_d); + functor.evaluate(shared_load_stream.fragment(), transformed_c, fragment_d); + } else { + functor.evaluate(shared_load_stream.fragment(), fragment_d); } // Transform D fragment. - typename GlobalTransformerD::OutputFragment transformed_d; - transformer_d.transform(fragment_d, transformed_d); + typename GlobalTransformerD::OutputFragment global_transformed_d; + transformer_d.transform(fragment_d, global_transformed_d); // Copy the results to global memory. - iterator_store(global_store_iterator, transformed_d); + global_store_iterator.store_post_increment(global_transformed_d); } } } @@ -222,7 +213,9 @@ struct GemmEpilogue { /// The shared storage. SharedStorage& shared_storage; /// The dimensions of the GEMM. - Index m, n; + Coord<3> problem_size; + // The functor. 
+ Functor functor; }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cutlass/gemm/gemm_epilogue_traits.h b/cutlass/gemm/gemm_epilogue_traits.h index c06fc25026..c6aff71e14 100644 --- a/cutlass/gemm/gemm_epilogue_traits.h +++ b/cutlass/gemm/gemm_epilogue_traits.h @@ -27,13 +27,13 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include +#include "cutlass/convert.h" +#include "cutlass/coord.h" +#include "cutlass/gemm/gemm_global_stream.h" +#include "cutlass/gemm/gemm_shared_stream.h" +#include "cutlass/gemm/linear_scaling.h" +#include "cutlass/reshape_tile.h" +#include "cutlass/tile_iterator.h" namespace cutlass { namespace gemm { @@ -57,8 +57,8 @@ template < typename SharedStoreIteratorD_, /// The shared store transformer for D. typename SharedStoreTransformerD_, - /// The iterator to load D from shared memory. - typename SharedLoadIteratorD_, + /// The stream to load D from shared memory. + typename SharedLoadStreamD_, /// The number of iterations in the epilogue. typename Iterations_, /// The iterations strides. @@ -86,8 +86,8 @@ struct GemmEpilogueTraits { typedef SharedStoreIteratorD_ SharedStoreIteratorD; /// The shared store transformer for D. typedef SharedStoreTransformerD_ SharedStoreTransformerD; - /// The iterator to store D in shared memory. - typedef SharedLoadIteratorD_ SharedLoadIteratorD; + /// The stream to store D in shared memory. + typedef SharedLoadStreamD_ SharedLoadStreamD; /// typedef typename GemmConfig::EpilogueIterations Iterations; typedef Iterations_ Iterations; /// The iterations strides. @@ -118,14 +118,15 @@ struct GemmEpilogueTraits { typename GlobalStoreIteratorD::Params iterator_d; /// The params for the D shared store iterator. typename SharedStoreIteratorD::Params shared_store_iterator_d; - /// The params for the D shared load iterator. 
- typename SharedLoadIteratorD::Params shared_load_iterator_d; + /// The params for the D shared load stream. + typename SharedLoadStreamD::Params shared_load_stream_d; /// The functor params. typename Functor::Params functor; /// Setup the params. template CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const& desc) { + // The parameters for the functor. int error_code = functor.initialize(desc); if (error_code) { @@ -133,20 +134,27 @@ struct GemmEpilogueTraits { } // At the end of the H iteration, we jump over a number of columns. - this->stride_h = desc.ldd * Delta::kH; + this->stride_h = desc.D.leading_dim() * Delta::kH; // Nothing to do here. this->stride_w = 0; - // Setup the params for the global memory iterator for C. - error_code = iterator_c.initialize( - reinterpret_cast(desc.d_c), desc.ldc, desc.n, stride_w, Delta::kW); + error_code = iterator_c.initialize(desc.C.data(), + desc.batch_stride_C, + desc.C.leading_dim(), + desc.problem_size[1], + stride_w, + Delta::kW); if (error_code) { return error_code; } // Setup the params for the global memory iterator for D. - return iterator_d.initialize( - reinterpret_cast(desc.d_d), desc.ldd, desc.n, stride_w, Delta::kW); + return iterator_d.initialize(desc.D.data(), + desc.batch_stride_D, + desc.D.leading_dim(), + desc.problem_size[1], + stride_w, + Delta::kW); } }; @@ -155,13 +163,20 @@ struct GemmEpilogueTraits { // The storage for the store iterator. typename SharedStoreIteratorD::SharedStorage store; // The storage for the store iterator. - typename SharedLoadIteratorD::SharedStorage load; + typename SharedLoadStreamD::SharedStorage load; }; /// The shared memory to swizzle the data in the epilogue. struct SharedStorage { // The storage for the shared stream D. 
StreamSharedStorage shared_stream; + + // + // + // + + CUTLASS_DEVICE + ScalarD* data() { return reinterpret_cast(&shared_stream.load); } }; }; @@ -192,7 +207,10 @@ struct GemmEpilogueTraitsHelper { /// The traits class to build the iterator to store to shared memory for D. typedef GemmSharedStoreTileDTraits< // The pointer is float. - typename Functor::Scalar, + // typename Functor::Scalar, + // Functor::Scalar is alpha, beta type, in mixed precision, alpha and beta may not be the same with accumulation. + // In this case Functor::ScalarAccum is needed + typename Functor::ScalarAccum, // The output tile size. typename GemmConfig_::OutputTile, // The number of warps. @@ -221,7 +239,10 @@ struct GemmEpilogueTraitsHelper { /// The traits class to build the iterator to load from shared memory for D. typedef GemmSharedLoadTileDTraits< // The pointer is float. - typename Functor::Scalar, + // typename Functor::Scalar, + // Functor::Scalar is alpha, beta type, in mixed precision, alpha and beta may not be the same with accumulation. + // In this case Functor::ScalarAccum is needed + typename Functor::ScalarAccum, // The output tile size. typename GemmConfig_::OutputTile, // The number of warps. @@ -242,6 +263,8 @@ struct GemmEpilogueTraitsHelper { IteratorAdvance::kH, MemorySpace::kShared> SharedLoadIteratorD; + /// The stream to load D. + typedef SharedLoadStream SharedLoadStreamD; /// The traits class to build the iterator to load data from global memory for C^N. typedef GemmGlobalTileCdTraits< @@ -314,8 +337,8 @@ struct SimplifiedGemmEpilogueTraits : public GemmEpilogueTraits< typename Helper_::SharedStoreIteratorD, // The shared store transformer for D. typename Helper_::SharedStoreTransformerD, - // The iterator to load D from shared memory. - typename Helper_::SharedLoadIteratorD, + // The stream to load D from shared memory. + typename Helper_::SharedLoadStreamD, // The number of iterations. typename Helper_::Iterations, // The strides between iterations. 
diff --git a/cutlass/gemm/gemm_global_stream.h b/cutlass/gemm/gemm_global_stream.h index ec675a38fe..6ea72cf30c 100644 --- a/cutlass/gemm/gemm_global_stream.h +++ b/cutlass/gemm/gemm_global_stream.h @@ -29,9 +29,10 @@ */ #pragma once -#include -#include -#include +#include "cutlass/coord.h" +#include "cutlass/convert.h" +#include "cutlass/gemm/gemm_global_tile.h" +#include "cutlass/tile_allocation.h" namespace cutlass { namespace gemm { @@ -39,6 +40,8 @@ namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// template < + /// Identifies multiplicand + GemmOperand::Kind Operand, /// The load iterator. typename LoadIterator_, /// The store iterator to copy to shared memory. @@ -46,7 +49,9 @@ template < /// The transformer to be applied after the data has been copied from global memory. typename Transformer_> -struct GlobalLoadStreamBase { +struct GlobalLoadStream { + /// Indicates the type of GEMM operand + static GemmOperand::Kind const kOperand = Operand; /// The load iterator. typedef LoadIterator_ LoadIterator; /// The transformer. @@ -75,6 +80,15 @@ struct GlobalLoadStreamBase { typedef typename LoadIterator::Pointer Pointer; /// The index. typedef typename LoadIterator::Index Index; + /// The tile + typedef typename LoadIterator::Tile Tile; + + /// Shared memory allocation for the tile + typedef TileAllocation + ThreadblockTileStorage; + + /// Tensor reference to threadblock tile + typedef typename ThreadblockTileStorage::TensorRef ThreadblockTileRef; /// The params. struct Params { @@ -82,56 +96,73 @@ struct GlobalLoadStreamBase { typename LoadIterator::Params load_iterator; // The store iterator. typename StoreIterator::Params store_iterator; + // Offset to residue. + Index offset_to_residue; /// Setup the params. 
- template - CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const& desc, Pointer pointer, Index ld) { - int error_code = load_iterator.initialize(desc, pointer, ld); + CUTLASS_HOST_DEVICE int initialize(Pointer pointer, + long long batch_stride, + Index ldm, + Index _offset_to_residue) { + + offset_to_residue = _offset_to_residue; + int error_code = load_iterator.initialize(pointer, batch_stride, ldm); if (error_code) { return error_code; } - return store_iterator.initialize(); } }; - /// The amount of storage in shared memory needed to store the tile. - typedef typename StoreIterator::SharedStorage SharedStoreStorage; - - /// The storage in shared memory needed by that stream. - union SharedStorage { - // The load iterator. - typename LoadIterator::SharedStorage load_iterator; - // The store iterator. - SharedStoreStorage store_iterator; - }; + /// Contains private storage in shared memory needed by the objects within this class. Note, + /// this is *NOT* the shared memory allocation for the GEMM threadblock tile. That necessarily + /// exists outside this class, as it is also needed by the warp-level shared=>RF stream. + struct SharedStorage {}; + + // + // Static member functions + // + + /// Maps a coordinate in the GEMM's (K, N, M) coordinate system to global memory + CUTLASS_DEVICE static Coord<3> project_coordinate(Coord<3> const& coord, Index d_offset = 0) { + bool const kKstrided = + GemmMultiplicandTraits::kKstrided; + Coord<3> tile_coord = ProjectOperand::project(coord); + return make_Coord( + tile_coord[0] + d_offset, tile_coord[1], tile_coord[2] / LoadIterator::Tile::kC); + } /// Ctor. 
- CUTLASS_DEVICE GlobalLoadStreamBase(Params const& params, - SharedStorage& shared_storage, - Coord<3> const bounds, - Coord<3> const& block) - : load_iterator(params.load_iterator, bounds, block), + CUTLASS_DEVICE GlobalLoadStream( + Params const& _params, + SharedStorage& shared_storage, + ThreadblockTileRef const& threadblock_tile_ref, + Coord<3> const bounds, + Coord<3> const& _threadblock_offset) + : params(_params), + multiplicand_bounds(project_coordinate(bounds, 1)), + threadblock_offset(project_coordinate(_threadblock_offset)), + load_iterator(params.load_iterator, + project_coordinate(bounds, 1), /*multiplicant_bounds*/ + project_coordinate(_threadblock_offset) /*threablock_offset*/), transformer(), - store_iterator(params.store_iterator, shared_storage.store_iterator) - + store_iterator(params.store_iterator, threadblock_tile_ref.data()) { + load_iterator.initialize_predicates(multiplicand_bounds, threadblock_offset); fetched_fragment.clear(); } + /// Load the data from shared memory to the fetch fragment. - CUTLASS_DEVICE void copy() { iterator_load(load_iterator, fetched_fragment); } + CUTLASS_DEVICE void copy() { load_iterator.load_post_increment(fetched_fragment); } /// Commit the data. CUTLASS_DEVICE void commit() { transformer.transform(fetched_fragment, transformed_fragment); - iterator_store(store_iterator, transformed_fragment); + store_iterator.store_post_increment(transformed_fragment); store_iterator.inc_stage(); } - /// Move to the beginning of the residue code. That's a new code path in CUTLASS 1.0.1. - CUTLASS_DEVICE void move_to_residue(Index k) { load_iterator.move_to_residue(k); } - /// Execute the residue code. CUTLASS_DEVICE void residue(Index k, bool skip_clear = false) { load_iterator.residue(k); @@ -140,9 +171,43 @@ struct GlobalLoadStreamBase { } } - /// Rollback to the beginning of the GEMM-k dimension. - CUTLASS_DEVICE void rollback() { load_iterator.rollback(); } + /// Move to the residue portion. 
+ CUTLASS_DEVICE void move_to_residue(Index k, Index kTileK) { + Index kResidue = k % kTileK; + if (kResidue) { + residue(kResidue); + } + load_iterator.add_pointer_offset(params.offset_to_residue * load_iterator.stride_advance()); + } + + /// Rollback to the beginning of the first tile + CUTLASS_DEVICE void rollback(void) { + load_iterator.initialize_predicates(multiplicand_bounds, threadblock_offset); + + int const kBlock = kOperand == GemmOperand::kA + ? (kLayout == MatrixLayout::kColumnMajor ? Tile::kH : Tile::kW) + : (kLayout == MatrixLayout::kRowMajor ? Tile::kH : Tile::kW); + + load_iterator.add_pointer_offset(-(params.offset_to_residue + kBlock) * + load_iterator.stride_advance()); + } + + /// Adds a Coord<3> to the underlying global load iterator + CUTLASS_DEVICE GlobalLoadStream &operator+=(Coord<3> const &offset) { + load_iterator += offset; + return *this; + } + // + // Data members + // + + /// Parameters + Params params; + /// Multiplicand bounds + Coord<3> multiplicand_bounds; + /// Threadblock offset + Coord<3> threadblock_offset; /// The iterator. LoadIterator load_iterator; /// The fragment to fetch from shared memory. @@ -155,28 +220,6 @@ struct GlobalLoadStreamBase { StoreIterator store_iterator; }; -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template < - /// The load iterator. - typename LoadIterator_, - /// The store iterator to copy to shared memory. - typename StoreIterator_, - /// The transformer to be applied after the data has been copied from global memory. - typename Transformer_ = Copy > - -struct GlobalLoadStream : public GlobalLoadStreamBase { - /// The base class. - typedef GlobalLoadStreamBase Base; - - /// Ctor. 
- CUTLASS_DEVICE GlobalLoadStream(typename Base::Params const& params, - typename Base::SharedStorage& shared_storage, - Coord<3> const& bounds, - Coord<3> const& block) - : Base(params, shared_storage, bounds, block) {} -}; - //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace gemm } // namespace cutlass diff --git a/cutlass/gemm/gemm_global_tile.h b/cutlass/gemm/gemm_global_tile.h index 1cc3b3377a..a355ebea0e 100644 --- a/cutlass/gemm/gemm_global_tile.h +++ b/cutlass/gemm/gemm_global_tile.h @@ -27,14 +27,14 @@ */ #pragma once -#include -#include +#include "cutlass/coord.h" +#include "cutlass/util/platform.h" -#include -#include -#include -#include -#include +#include "cutlass/gemm/gemm_operand.h" +#include "cutlass/matrix_traits.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/reshape_tile.h" +#include "cutlass/tile_iterator.h" namespace cutlass { namespace gemm { @@ -80,20 +80,24 @@ struct GemmGlobalTileTraits { static int const kAccessSize = kAccessSize_; /// The memory space. static MemorySpace::Kind const kMemorySpace = MemorySpace::kGlobal; - /// The tile shape - typedef typename ReshapeTile::Tile Tile; + typedef Tile_ Tile; + /// The vectorized tile shape + typedef typename ReshapeTile::Tile VectorizedTile; /// The threads shape - typedef typename ReshapeThreads::Threads Threads; + typedef typename ReshapeThreads::Threads Threads; /// The relative offset between two elements in the H/W dimension in adjacent threads. - typedef Shape<1, 1, Tile::kC> ThreadsDelta; - + typedef Shape<1, 1, VectorizedTile::kC> ThreadsDelta; /// The strides in each dimension between different loads/stores. typedef Shape<0, Threads::kH, Threads::kW * kAccessSize> Delta; + /// Strides for immediate offset computation typedef Shape<0, 0, Threads::kW * ThreadsDelta::kW, kAccessSize> ImmediateOffsetStrides; /// The number of iterations needed to load/store the tile. 
- typedef Shape<1, Tile::kH / Threads::kH, Tile::kW / Threads::kW, Tile::kC / kAccessSize> + typedef Shape<1, + VectorizedTile::kH / Threads::kH, + VectorizedTile::kW / Threads::kW, + VectorizedTile::kC / kAccessSize> Iterations; typedef GemmMultiplicandTraits MultiplicandTraits; @@ -165,7 +169,6 @@ struct GemmGlobalIteratorAb Index_> { /// This class. typedef GemmGlobalIteratorAb This_; /// The base class. - typedef TileLoadIterator - CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const& desc, Scalar const* ptr, Index stride_h) { + CUTLASS_HOST_DEVICE int initialize(Scalar const* ptr, + long long stride_d, + Index stride_h) { Index inc_d = 0; Index inc_advance = 0; // Move by some columns for each iteration in the H dimension. @@ -221,99 +227,36 @@ struct GemmGlobalIteratorAb (Base::Iterations::kH - 1) * inc_h; } - // The dimensions of the tile. - int const kH = TileTraits_::Tile::kH; - int const kW = TileTraits_::Tile::kW * TileTraits_::kAccessSize; - - // Move to the residue. - Index const kBlock = kAdvance == IteratorAdvance::kH ? kH : kW; - // The jump in the gemm-k dimension. - Index const stride = kAdvance == IteratorAdvance::kH ? stride_h : 1; - - // Compute the offset to the residue and how to "come" back. - Index const kResidue = desc.k % kBlock; - if (kResidue > 0) { - move_to_residue_offset = (desc.k - kResidue) * stride; - } else { - move_to_residue_offset = (desc.k - kBlock) * stride; - } - - Base::Params::initialize(ptr, 0, stride_h, 1, inc_d, inc_h, 0, inc_advance); + Base::Params::initialize( + ptr, stride_d, stride_h, 1, inc_d, inc_h, 0, inc_advance); return 0; } - - // The extra offset to control moving to the residue. - Index move_to_residue_offset; }; - /// Ctor. - CUTLASS_DEVICE GemmGlobalIteratorAb(Params const& _params, - const Coord<3>& bounds, - const Coord<3>& block, - ThreadOffset thread_offset_func = ThreadOffset()) - : params(_params) { - thread_offset = thread_offset_func(); - // The column. 
- Index block_h = thread_offset[1]; - // The contiguous dimension. - Index block_w = thread_offset[2]; - - // Add the blocks indices. - if (kAdvance == IteratorAdvance::kH) { - block_h += block[1]; - block_w += block[2]; - - } else { - block_h += block[2]; - block_w += block[1]; - } - - // Setup the pointer. - params.pointer += (block_h * params.stride_h + block_w); - - // Initialize predicates - initialize_predicates(bounds, make_Coord(0, block_h, block_w)); - } - - /// The accessor. - CUTLASS_DEVICE void get(typename Base::AccessType& value, int d, int h, int w, int c) const { - int const imm = - ComputeOffsetFromStrides::get(0, 0, w, c); - Load::load(value, params.pointer, imm); - } - - /// Increment the pointer in the H dimension. - CUTLASS_DEVICE void inc_h() { params.pointer += params.inc_h; } - /// Increment the pointer in the D dimension. - CUTLASS_DEVICE void inc_d() { params.pointer += params.inc_d; } - /// Increment the pointer to move to the next iteration. - CUTLASS_DEVICE void inc_advance() { params.pointer += params.inc_advance; } + /// Offset of an individual lane from the start of the tile + Coord<4> thread_offset; + /// The parameters + Params params; + /// The predicates. + PredicateVector predicates; - /// Initialize the predicates. - CUTLASS_DEVICE void initialize_predicates(const Coord<3>& bounds, const Coord<3>& block) { + CUTLASS_HOST_DEVICE void initialize_predicates(const Coord<3>& bounds, const Coord<3>& block_offset) { // Setup the masks to control loads. predicates.fill(0); - int bounds_h, bounds_w; - if (kAdvance == IteratorAdvance::kH) { - bounds_w = bounds[2] - block[2]; - bounds_h = bounds[1]; - - } else { - bounds_w = bounds[1]; - bounds_h = bounds[2] - block[1]; - } - // Fill in the bits of the predicate vector. 
for (int d = 0; d < Base::Iterations::kD; ++d) { for (int h = 0; h < Base::Iterations::kH; ++h) { for (int w = 0; w < Base::Iterations::kW; ++w) { for (int c = 0; c < Base::Iterations::kC; ++c) { - bool flag = w * Base::Delta::kW < bounds_w; + bool flag = w * Base::Delta::kW + thread_offset[2] + block_offset[2] < bounds[2]; if (kAdvance == IteratorAdvance::kH) { - flag = flag && (h * Base::Delta::kH + d * Base::Delta::kD) < bounds_h; + flag = + flag && + (h * Base::Delta::kH + d * Base::Delta::kD) + thread_offset[1] + block_offset[1] < + bounds[1]; } else { - flag = flag && (h * Base::Delta::kH) < bounds_h; + flag = flag && (h * Base::Delta::kH) + thread_offset[1] + block_offset[1] < bounds[1]; } int const bit = ComputeOffsetFromShape::get(d, h, w, c); predicates.set(bit, flag); @@ -323,31 +266,44 @@ struct GemmGlobalIteratorAb } } - /// Move to residue portion. - CUTLASS_DEVICE void move_to_residue(Index k) { - // Store the pointer and the predicates. - stored_pointer = params.pointer; - stored_predicates = predicates; - - // Move the pointer to the residue. - params.pointer += params.move_to_residue_offset; + /// Ctor. + CUTLASS_HOST_DEVICE GemmGlobalIteratorAb(Params const& _params, + const Coord<3>& bounds, + const Coord<3>& threadblock_offset, + ThreadOffset thread_offset_func = ThreadOffset()) + : params(_params) { + thread_offset = thread_offset_func(); + // Setup the pointer. + params.pointer += ((threadblock_offset[1] + thread_offset[1]) * params.stride_h + + (threadblock_offset[2] + thread_offset[2])); - // The dimensions of the tile. - int const kH = TileTraits_::Tile::kH; - int const kW = TileTraits_::Tile::kW * TileTraits_::kAccessSize; + } - // The unrolling factor. - int const kUnroll = kAdvance == IteratorAdvance::kH ? kH : kW; + /// Increment the pointer in the W dimension. + CUTLASS_HOST_DEVICE void inc_w() { Base::inc_w(); } + /// Increment the pointer in the H dimension. 
+ CUTLASS_HOST_DEVICE void inc_h() { params.pointer += params.inc_h; } + /// Increment the pointer in the D dimension. + CUTLASS_HOST_DEVICE void inc_d() { params.pointer += params.inc_d; } + /// Increment the pointer to move to the next iteration. + CUTLASS_HOST_DEVICE void inc_advance() { params.pointer += params.inc_advance; } - // Clear the predicates for the residue. TODO: We can do something smarter. - int const kResidue = (int)(k % (Index)kUnroll); - if (kResidue > 0) { - residue(kResidue); - } + /// Loads a single fragment element from memory + CUTLASS_HOST_DEVICE void load_element( + typename Base::AccessType& value, int d, int h, int w, int c) const { + int const offset = + ComputeOffsetFromStrides::get(0, 0, w, c); + Load::load(value, params.pointer, offset); } /// That's the residue! Update the predicates. - CUTLASS_DEVICE void residue(Index k) { + CUTLASS_HOST_DEVICE void residue(Index k) { // The coordinates of the thread. Index block_h = thread_offset[1]; // The contiguous dimension. @@ -375,26 +331,63 @@ struct GemmGlobalIteratorAb } } - /// Rollback to beginning of first tile and initialize predicates. - CUTLASS_DEVICE void rollback() { - params.pointer = stored_pointer; - predicates = stored_predicates; - } - - /// Is the iterator valid? - CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const { + /// Is the valid? + CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const { int const bit = ComputeOffsetFromShape::get(d, h, w, c); return predicates[bit]; } - /// Offset of an individual lane from the start of the tile - Coord<4> thread_offset; - /// The parameters - Params params; - /// The pointer. - typename Base::Scalar const* stored_pointer; - /// The predicates. 
- PredicateVector predicates, stored_predicates; + /// Adds a vector offset to the iterator + CUTLASS_HOST_DEVICE GemmGlobalIteratorAb & operator+=(Coord<3> const &offset) { + + long long _offset = offset.template dot( + make_Coord(params.stride_d, params.stride_h, params.stride_w) + ); + + params.pointer += _offset; + return *this; + } + + CUTLASS_HOST_DEVICE void add_pointer_offset(Index offset) { params.pointer += offset; } + + CUTLASS_HOST_DEVICE Index stride_advance(void) { + Index stride = params.stride_h; + if (kAdvance == IteratorAdvance::kW) { + stride = params.stride_w; + } + return stride; + } + + template + CUTLASS_HOST_DEVICE void load_post_increment(Fragment& fragment) { + typename Base::FragmentIterator frag_iterator(fragment); + for (int d = 0; d < Base::Iterations::kD; ++d) { + for (int h = 0; h < Base::Iterations::kH; ++h) { + for (int w = 0; w < Base::Iterations::kW; ++w) { + for (int c = 0; c < Base::Iterations::kC; ++c) { + if (valid(d, h, w, c)) { + load_element( + reinterpret_cast(frag_iterator.at(d, h, w, c)), + d, + h, + w, + c); + } + } + if (w < Base::Iterations::kW - 1) { + inc_w(); + } + } + if (h < Base::Iterations::kH - 1) { + inc_h(); + } + } + if (d < Base::Iterations::kD - 1) { + inc_d(); + } + } + inc_advance(); + } }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -433,6 +426,8 @@ struct GemmGlobalIteratorCd : public TileIteratorBasepointer = pointer; + // Stride per batch + stride_d = batch_stride; // Each column of the matrix. - stride_h = TileTraits_::ThreadsDelta::kH * ld; + stride_h = TileTraits_::ThreadsDelta::kH * ldm; // Each thread output 1 column per iteration. The stride between columns is given by the // number of scalars that are loaded per LDS for B. 
- inc_h = ld * TileTraits_::kStrideH; + inc_h = ldm * TileTraits_::kStrideH; inc_advance = - (ld - ld * TileTraits_::kStrideH * (Base::Iterations::kH - 1)) + epilogue_stride_w; + (ldm - ldm * TileTraits_::kStrideH * (Base::Iterations::kH - 1)) + epilogue_stride_w; predicate_offset = bound; predicate_inc_h = TileTraits_::kStrideH; @@ -464,75 +465,173 @@ struct GemmGlobalIteratorCd : public TileIteratorBase thread_offset; + /// The predicates for the row. + cutlass::PredicateVector predicates; /// Ctor. - CUTLASS_DEVICE GemmGlobalIteratorCd() {} + CUTLASS_HOST_DEVICE GemmGlobalIteratorCd(Params const& _params, + const Coord<3>& bounds, + const Coord<3>& block_offset, + ThreadOffset thread_offset_func = ThreadOffset()) + : params(_params) { + thread_offset = thread_offset_func(); + // Prepare the vector of predicates. + for (int i = 0; i < Base::Iterations::kW; ++i) { + predicates.set(i, thread_offset[2] + i * Base::Delta::kW < bounds[2]); + } + } /// Ctor. - CUTLASS_DEVICE GemmGlobalIteratorCd(Params const& params, - const Coord<3>& bounds, - const Coord<3>& block, - int offset = 0, - int pred_offset = 0, - ThreadOffset thread_offset_func = ThreadOffset()) - : params(params) { + CUTLASS_HOST_DEVICE GemmGlobalIteratorCd(Params const& _params, + const Coord<3>& bounds, + const Coord<3>& block, + int offset = 0, + int pred_offset = 0, + ThreadOffset thread_offset_func = ThreadOffset()) + : params(_params) { thread_offset = thread_offset_func(); // Each warp works on a different column of the tile. int const h = thread_offset[1] + block[1]; // Each lane writes a different element. int const w = thread_offset[2] + block[2]; // Setup the pointer. - this->params.pointer += ((h * params.stride_h + w) + offset); + params.pointer += ((h * params.stride_h + w) + offset); // Prepare the vector of predicates. 
for (int i = 0; i < Base::Iterations::kW; ++i) { predicates.set(i, w + i * Base::Delta::kW < bounds[2]); } - this->params.predicate_offset -= (h + pred_offset); - } - - /// The accessor. - CUTLASS_DEVICE void get(typename Base::AccessType& value, int d, int h, int w, int c) const { - int const imm = - ComputeOffsetFromStrides::get(0, 0, w, c); - Load::load(value, params.pointer, imm); + params.predicate_offset -= (h + pred_offset); } /// Increment the pointer in the C dimension. - CUTLASS_DEVICE void inc_c() {} + CUTLASS_HOST_DEVICE void inc_c() {} /// Increment the pointer in the W dimension. - CUTLASS_DEVICE void inc_w() {} + CUTLASS_HOST_DEVICE void inc_w() {} /// Increment the pointer in the H dimension. - CUTLASS_DEVICE void inc_h() { + CUTLASS_HOST_DEVICE void inc_h() { params.pointer += params.inc_h; params.predicate_offset -= params.predicate_inc_h; } /// Increment the pointer in the D dimension. - CUTLASS_DEVICE void inc_d() {} + CUTLASS_HOST_DEVICE void inc_d() {} /// Increment the pointer to move to the next iteration. - CUTLASS_DEVICE void inc_advance() { + CUTLASS_HOST_DEVICE void inc_advance() { params.pointer += params.inc_advance; - this->params.predicate_offset -= params.predicate_inc_advance; + params.predicate_offset -= params.predicate_inc_advance; } - /// The accessor. - CUTLASS_DEVICE void set(typename Base::AccessType const& value, int d, int h, int w, int c) { - int const imm = - ComputeOffsetFromStrides::get(0, 0, w, c); - Store::store( - value, params.pointer, imm); + /// Adds a vector offset to the iterator + CUTLASS_HOST_DEVICE GemmGlobalIteratorCd & operator+=(Coord<3> const &offset) { + long long _offset = offset.template dot( + make_Coord(params.stride_d, params.stride_h, 1) + ); + params.pointer += _offset; + return *this; + } + + /// Loads a single fragment element from memory. 
+ CUTLASS_HOST_DEVICE void load_element( + typename Base::AccessType& value, int d, int h, int w, int c) const { + int const offset = + ComputeOffsetFromStrides::get(d, h, w, c); + Load::load(value, params.pointer, offset); } - /// Test the validity of the iterator. - CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const { + /// Stores a single fragment element into memory. + CUTLASS_HOST_DEVICE void store_element( + typename Base::AccessType const& value, int d, int h, int w, int c) { + int const offset = + ComputeOffsetFromStrides::get(d, h, w, c); + Store::store(value, params.pointer, offset); + } + + /// Test the validity of the + CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const { return predicates.at(w) && params.predicate_offset > 0; } - /// The predicates for the row. - cutlass::PredicateVector predicates; + /// add pointer offset + CUTLASS_HOST_DEVICE void add_pointer_offset(Index offset) { params.pointer += offset; } + + /// Loads and increments iterator + template + CUTLASS_HOST_DEVICE void load_post_increment(Fragment& fragment) { + typename Base::FragmentIterator frag_iterator(fragment); + for (int d = 0; d < Base::Iterations::kD; ++d) { + for (int h = 0; h < Base::Iterations::kH; ++h) { + for (int w = 0; w < Base::Iterations::kW; ++w) { + for (int c = 0; c < Base::Iterations::kC; ++c) { + if (valid(d, h, w, c)) { + load_element( + reinterpret_cast(frag_iterator.at(d, h, w, c)), + d, + h, + w, + c); + } + } + if (w < Base::Iterations::kW - 1) { + inc_w(); + } + } + if (h < Base::Iterations::kH - 1) { + inc_h(); + } + } + if (d < Base::Iterations::kD - 1) { + inc_d(); + } + } + inc_advance(); + } + + template + CUTLASS_HOST_DEVICE void store_post_increment(Fragment& fragment) { + typename Base::FragmentIterator frag_iterator(fragment); + for (int d = 0; d < Base::Iterations::kD; ++d) { + for (int h = 0; h < Base::Iterations::kH; ++h) { + for (int w = 0; w < Base::Iterations::kW; ++w) { + for (int c = 0; c < Base::Iterations::kC; 
++c) { + if (valid(d, h, w, c)) { + store_element( + reinterpret_cast(frag_iterator.at(d, h, w, c)), + d, + h, + w, + c); + } + } + if (w < Base::Iterations::kW - 1) { + inc_w(); + } + } + if (h < Base::Iterations::kH - 1) { + inc_h(); + } + } + if (d < Base::Iterations::kD - 1) { + inc_d(); + } + } + inc_advance(); + } }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cutlass/gemm/gemm_operand.h b/cutlass/gemm/gemm_operand.h index 737f993f01..2b4dcdc916 100644 --- a/cutlass/gemm/gemm_operand.h +++ b/cutlass/gemm/gemm_operand.h @@ -28,9 +28,9 @@ */ #pragma once -#include -#include -#include +#include "cutlass/matrix_traits.h" +#include "cutlass/reshape_tile.h" +#include "cutlass/util/platform.h" namespace cutlass { namespace gemm { diff --git a/cutlass/gemm/gemm_shared_stream.h b/cutlass/gemm/gemm_shared_stream.h index c6ff7bd973..df20bd6ca5 100644 --- a/cutlass/gemm/gemm_shared_stream.h +++ b/cutlass/gemm/gemm_shared_stream.h @@ -28,7 +28,8 @@ */ #pragma once -#include +#include "cutlass/tensor_ref.h" +#include "cutlass/gemm/gemm_shared_tile.h" namespace cutlass { namespace gemm { @@ -56,6 +57,11 @@ struct SharedLoadStream { ""); /// The output fragment. typedef TransformedFragment Fragment; + /// Scalar data type + typedef typename Iterator::Scalar Scalar; + + /// Reference type to a tensor + typedef TensorRef TensorRef; /// The params. struct Params { @@ -73,29 +79,38 @@ struct SharedLoadStream { CUTLASS_DEVICE SharedLoadStream() {} /// Ctor. - CUTLASS_DEVICE SharedLoadStream(Params const ¶ms, SharedStorage &shared_storage) { - this->initialize(params, shared_storage); + CUTLASS_DEVICE SharedLoadStream(Params const ¶ms, TensorRef const &ref) { + this->initialize(params, ref); } /// Initialize the stream. - CUTLASS_DEVICE void initialize(Params const ¶ms, SharedStorage &shared_storage) { + CUTLASS_DEVICE void initialize(Params const ¶ms, TensorRef const &ref) { // The iterator. 
- iterator = Iterator(params.iterator, shared_storage); + iterator = Iterator(params.iterator, ref.data()); // The transformer. transformer = Transformer(); } /// Load the data from shared memory to the fetch fragment. - CUTLASS_DEVICE void copy(FetchedFragment &fetched) { shared_iterator_load(iterator, fetched); } + CUTLASS_DEVICE void copy() { iterator.load_post_increment(fetched[0]); } /// Load the data from shared memory to the fetch fragment. - CUTLASS_DEVICE void copy(int d, FetchedFragment &fetched) { - shared_iterator_load(iterator, fetched, d); - } + CUTLASS_DEVICE void copy(int step) { iterator.load(fetched[step % 2], step); } + + /// Commit the data. + CUTLASS_DEVICE void commit() { transformer.transform(fetched[0], transformed[0]); } /// Commit the data. - CUTLASS_DEVICE void commit(FetchedFragment &fetched, TransformedFragment &transformed) { - transformer.transform(fetched, transformed); + CUTLASS_DEVICE void commit(int step) { + transformer.transform(fetched[step % 2], transformed[step % 2]); + } + + /// Returns the fragment for the given step + CUTLASS_DEVICE TransformedFragment &fragment(int step = 0) { return transformed[step % 2]; } + + /// Returns the fragment for the given step + CUTLASS_DEVICE TransformedFragment const &fragment(int step = 0) const { + return transformed[step % 2]; } /// Increment the stage. @@ -103,8 +118,12 @@ struct SharedLoadStream { /// The iterator. Iterator iterator; + /// Fetched fragment + FetchedFragment fetched[2]; /// The transformer. 
Transformer transformer; + /// Transformed fragment + TransformedFragment transformed[2]; }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cutlass/gemm/gemm_shared_tile.h b/cutlass/gemm/gemm_shared_tile.h index 7c61e02297..78fb1f2054 100644 --- a/cutlass/gemm/gemm_shared_tile.h +++ b/cutlass/gemm/gemm_shared_tile.h @@ -27,7 +27,7 @@ */ #pragma once -#include +#include "cutlass/gemm/gemm_operand.h" namespace cutlass { namespace gemm { diff --git a/cutlass/gemm/gemm_stream_pair.h b/cutlass/gemm/gemm_stream_pair.h new file mode 100644 index 0000000000..0a6df15ed4 --- /dev/null +++ b/cutlass/gemm/gemm_stream_pair.h @@ -0,0 +1,251 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines a pair of GEMM tile streams +*/ +#pragma once + +#include "cutlass/convert.h" +#include "cutlass/matrix_traits.h" +#include "cutlass/reshape_tile.h" +#include "cutlass/tile_allocation.h" +#include "cutlass/tile_iterator.h" + +#include "cutlass/gemm/clear_accumulators.h" +#include "cutlass/gemm/gemm_config.h" +#include "cutlass/gemm/gemm_global_stream.h" +#include "cutlass/gemm/gemm_operand.h" +#include "cutlass/gemm/gemm_shared_stream.h" +#include "cutlass/gemm/threadblock_swizzle.h" + +namespace cutlass { +namespace gemm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Collect the global load streams for multiplicands.
+template +struct GlobalLoadStreamPair { + // + // Type definitions + // + + /// Stream for A multiplicand + typedef StreamA_ StreamA; + + /// Stream for B multiplicand + typedef StreamB_ StreamB; + + /// Parameters object + struct Params { + /// Parameters object for StreamA + typename StreamA::Params stream_a; + + /// Parameters object for StreamB + typename StreamB::Params stream_b; + + /// Default constructor + CUTLASS_HOST_DEVICE + Params() {} + + /// Constructs a global load stream pair Params object + CUTLASS_HOST_DEVICE + Params(typename StreamA::Params const &_params_A, typename StreamB::Params const &_params_B) + : stream_a(_params_A), stream_b(_params_B) {} + }; + + /// Assumes the A stream defines the index type + typedef typename StreamA::Index Index; + + /// Shared memory allocation for threadblock-scoped GEMM tile + typedef ZipTileAllocation + ThreadblockTileStorage; + + /// ZipTensorRef to threadblock tiles + typedef typename ThreadblockTileStorage::TensorRef ThreadblockTileRef; + + /// Defines a structure containing shared storage for each pair + struct SharedStorage { + typename StreamA::SharedStorage stream_a; + typename StreamB::SharedStorage stream_b; + }; + + // + // Data members + // + + /// Stream for A multiplicand + StreamA stream_a; + + /// Stream for B multiplicand + StreamB stream_b; + + // + // Methods + // + + /// Ctor. 
+ CUTLASS_DEVICE GlobalLoadStreamPair(Params const &params, + SharedStorage &shared_storage, + ThreadblockTileRef const &threadblock_tile_ref, + Coord<3> const &bounds, + Coord<3> const &block_offset = make_Coord(0, 0, 0)) + : stream_a(params.stream_a, + shared_storage.stream_a, + threadblock_tile_ref.first, + bounds, + block_offset), + stream_b(params.stream_b, + shared_storage.stream_b, + threadblock_tile_ref.second, + bounds, + block_offset) {} + + CUTLASS_DEVICE + GlobalLoadStreamPair & operator+=(Coord<3> const offset) { + stream_a += offset; + stream_b += offset; + return *this; + } + + /// Trigger the copies from shared memory to registers. + CUTLASS_DEVICE void copy() { + stream_a.copy(); + stream_b.copy(); + } + + /// Commit the data. + CUTLASS_DEVICE void commit() { + stream_a.commit(); + stream_b.commit(); + } + + /// Execute the residue code. + CUTLASS_DEVICE void residue(Index k, bool skip_clear = false) { + stream_a.residue(k, skip_clear); + stream_b.residue(k, skip_clear); + } + + /// Move to residue. + CUTLASS_DEVICE void move_to_residue(Index k, Index kTileK) { + if (kResidueInProlog_) { + stream_a.move_to_residue(k, kTileK); + stream_b.move_to_residue(k, kTileK); + } else if (k < kTileK) { + residue(k, true); + } + } + + /// Rollback to beginning of first tile. + CUTLASS_DEVICE void rollback(bool kRollback) { + if (kResidueInProlog_ && kRollback) { + stream_a.rollback(); + stream_b.rollback(); + } + } +}; + +/// Collect the shared load streams for multiplicands.
+template +struct SharedStreamPair { + // + // Type definitions + // + + /// Stream for A multiplicand + typedef StreamA_ StreamA; + + /// Stream for B multiplicand + typedef StreamB_ StreamB; + + /// Parameters object passed to load iterators + struct Params { + /// Parameters object for StreamA + typename StreamA::Params stream_a; + + /// Parameters object for StreamB + typename StreamB::Params stream_b; + }; + + /// ZipTensorRef to threadblock tiles + typedef ZipTensorRef + ThreadblockTileRef; + + // + // Data members + // + + /// The stream for A. + StreamA stream_a; + + /// The stream for B. + StreamB stream_b; + + // + // Methods + // + + /// Construct with the composable structure + CUTLASS_DEVICE SharedStreamPair(Params const &params, ThreadblockTileRef const &threadblock_tile_ref) + : stream_a(params.stream_a, threadblock_tile_ref.first), + stream_b(params.stream_b, threadblock_tile_ref.second) {} + + /// Trigger the copies from shared memory to registers. + CUTLASS_DEVICE void copy(int step) { + stream_a.copy(step); + stream_b.copy(step); + } + + /// Commit the data. + CUTLASS_DEVICE void commit(int step) { + stream_a.commit(step); + stream_b.commit(step); + } + + /// The fragment A. + CUTLASS_DEVICE + typename StreamA::TransformedFragment const &fragment_a(int step) const { + return stream_a.fragment(step); + } + + /// The fragment B. + CUTLASS_DEVICE + typename StreamB::TransformedFragment const &fragment_b(int step) const { + return stream_b.fragment(step); + } + + /// Increment the stage.
+ CUTLASS_DEVICE void inc_stage() { + stream_a.inc_stage(); + stream_b.inc_stage(); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm +} // namespace cutlass diff --git a/cutlass/gemm/gemm_traits.h b/cutlass/gemm/gemm_traits.h index cb57c4d5cf..fd6efb4669 100644 --- a/cutlass/gemm/gemm_traits.h +++ b/cutlass/gemm/gemm_traits.h @@ -27,117 +27,27 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include - +#include "cutlass/convert.h" +#include "cutlass/matrix_traits.h" +#include "cutlass/reshape_tile.h" +#include "cutlass/tile_allocation.h" +#include "cutlass/tile_iterator.h" +#include "cutlass/kernel_launch.h" + +#include "cutlass/gemm/clear_accumulators.h" +#include "cutlass/gemm/gemm_config.h" +#include "cutlass/gemm/gemm_desc.h" +#include "cutlass/gemm/gemm_stream_pair.h" +#include "cutlass/gemm/gemm_global_stream.h" +#include "cutlass/gemm/gemm_operand.h" +#include "cutlass/gemm/gemm_shared_stream.h" +#include "cutlass/gemm/threadblock_swizzle.h" +#include "cutlass/gemm/gemm.h" namespace cutlass { namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// -template < - /// The scalar type for A. - typename ScalarA_, - /// The scalar type for B. - typename ScalarB_, - /// The scalar type for C. - typename ScalarC_, - /// The scalar type for D. - typename ScalarD_, - /// The output tile size for the GEMM KxNxM. - typename OutputTile_, - /// The functor to do the math. - typename MultiplyAdd_, - /// The number of scalars per LDG for A. - int kScalarsPerLdgA_, - /// The number of scalars per STS for A. - int kScalarsPerStsA_, - /// The number of scalars per LDG for A. - int kScalarsPerLdsA_, - /// The number of scalars per LDG for B. - int kScalarsPerLdgB_, - /// The number of scalars per STS for B. - int kScalarsPerStsB_, - /// The number of scalars per LDS for B. 
- int kScalarsPerLdsB_, - /// The number of scalars per LDG for C and STG for D. - int kScalarsPerLdgCAndStgD_, - /// The number of scalars per STS for D. - int kScalarsPerStsD_, - /// The number of scalars per LDS for D. - int kScalarsPerLdsD_, - /// The number of stages in shared memory to do single/double/triple-buffering. - int kStages_, - /// Do we do the residue in the prologue? - bool kResidueInPrologue_ = false> - -struct GemmConfig { - // - /// The scalar for A. - typedef ScalarA_ ScalarA; - /// The scalar for B. - typedef ScalarB_ ScalarB; - /// The scalar for C. - typedef ScalarC_ ScalarC; - /// The scalar for D. - typedef ScalarD_ ScalarD; - - /// The tile. - typedef OutputTile_ OutputTile; - /// The functor to do D = A*B + C. - typedef MultiplyAdd_ MultiplyAdd; - /// The shape of the instruction. - typedef typename MultiplyAdd::InstructionShape InstructionShape; - /// The number of accumulators per warp. - typedef typename MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp; - /// The accumulators. - typedef typename MultiplyAdd::Accumulators Accumulators; - - /// The number of warps. - typedef typename ShapeDiv::Shape Warps; - /// The default warp size (32 threads per warp). - static int const kWarpSize = cutlass::kWarpSize; - /// The numnber of threads. - static int const kThreads = ShapeCount::kCount * kWarpSize; - - /// The number of scalars per LDG/STS/LDS for A. - static int const kScalarsPerLdgA = kScalarsPerLdgA_; - static int const kScalarsPerStsA = kScalarsPerStsA_; - static int const kScalarsPerLdsA = kScalarsPerLdsA_; - - /// The number of scalars per LDG/STS/LDS for B. - static int const kScalarsPerLdgB = kScalarsPerLdgB_; - static int const kScalarsPerStsB = kScalarsPerStsB_; - static int const kScalarsPerLdsB = kScalarsPerLdsB_; - - /// The number of scalars per LDG for C. - static int const kScalarsPerLdgC = kScalarsPerLdgCAndStgD_; - - /// The number of scalars per STS/LDS/STG for D. 
- static int const kScalarsPerStgD = kScalarsPerLdgCAndStgD_; - static int const kScalarsPerStsD = kScalarsPerStsD_; - static int const kScalarsPerLdsD = kScalarsPerLdsD_; - - /// The number of accumulators that are going to be fed from one LDS A/B. - static int const kAccumulatorsPerLdsA = kScalarsPerLdsA / InstructionShape::kD; - static int const kAccumulatorsPerLdsB = kScalarsPerLdsB / InstructionShape::kD; - - /// The number of stages in shared memory to implement double, triple, more-buffering. - static int const kStages = kStages_; - - /// Do we do the residue in the prologue? - static bool const kResidueInPrologue = kResidueInPrologue_; -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - template struct GemmTileTraitsHelperA {}; @@ -416,60 +326,6 @@ struct GemmTileTraitsHelperB { //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct GemmResidue { - /// Move to residue portion. - template - static CUTLASS_DEVICE void move_to_residue(typename GemmTraits_::GlobalLoadStreamA& stream_a, - typename GemmTraits_::GlobalLoadStreamB& stream_b, - typename GemmTraits_::Index k) { - // The new code path in CUTLASS 1.0.1: We treat the residue in the prologue so we can have - // complete main loops after that. It helps simplify the logic in the main loop. - if (kIsPrologue) { - stream_a.move_to_residue(k); - stream_b.move_to_residue(k); - } - } - - /// Rollback to beginning of first tile and initialize predicates. - static CUTLASS_DEVICE void rollback(typename GemmTraits_::GlobalLoadStreamA& stream_a, - typename GemmTraits_::GlobalLoadStreamB& stream_b) { - stream_a.rollback(); - stream_b.rollback(); - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct GemmResidue { - /// Move to residue portion. 
- template - static CUTLASS_DEVICE void move_to_residue(typename GemmTraits_::GlobalLoadStreamA& stream_a, - typename GemmTraits_::GlobalLoadStreamB& stream_b, - typename GemmTraits_::Index k) { - // The index. - typedef typename GemmTraits_::Index Index; - // By how much we unroll the main loop. - Index const kUnroll = static_cast(GemmTraits_::OutputTile::kD); - - // Call the residue code. That's the same path as CUTLASS 1.0.0. - if (kIsPrologue && k < kUnroll) { - stream_a.residue(k, true); - stream_b.residue(k, true); - } else if (k <= kUnroll) { - stream_a.residue(k, false); - stream_b.residue(k, false); - } - } - - /// Rollback to beginning of first tile and initialize predicates. - static CUTLASS_DEVICE void rollback(typename GemmTraits_::GlobalLoadStreamA& stream_a, - typename GemmTraits_::GlobalLoadStreamB& stream_b) {} -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - template < /// The GEMM configuration. typename GemmConfig_, @@ -488,27 +344,27 @@ template < /// The index. typename Index_ = int, /// The tool used to clear accumulators. - typename ClearAccumulators_ = ClearAccumulators > + typename ClearAccumulators_ = ClearAccumulators > struct GemmTraits { - /// This class. + /// This traits typedef GemmTraits - This_; + GlobalLoadStreamA_, + GlobalLoadStreamB_, + SharedLoadStreamA_, + SharedLoadStreamB_, + Epilogue_, + BlockSwizzle_, + Index_, + ClearAccumulators_> This_; + + /// The struct that consumes this Traits + typedef typename cutlass::gemm::Gemm KernelClass; /// The configuration. typedef GemmConfig_ GemmConfig; /// The output tile. typedef typename GemmConfig::OutputTile OutputTile; - /// Is the residue treated in the prologue? - static bool const kResidueInPrologue = GemmConfig::kResidueInPrologue; /// The stream to load A from global memory to shared memory. typedef GlobalLoadStreamA_ GlobalLoadStreamA; @@ -544,18 +400,30 @@ struct GemmTraits { /// Clear the accumulators. 
typedef ClearAccumulators_ ClearAccumulators; - /// The params. - struct Params { - /// The dimensions of the GEMM. - Index m, n, k; - /// The params for the A stream. - typename GlobalLoadStreamA::Params global_stream_a; - /// The params for the B stream. - typename GlobalLoadStreamB::Params global_stream_b; - /// The params for the A stream from shared memory. - typename SharedLoadStreamA::Params shared_stream_a; - /// The params for the B stream from shared memory. - typename SharedLoadStreamB::Params shared_stream_b; + /// Assemble the global load streams for A/B. + typedef GlobalLoadStreamPair + GlobalLoadStream; + + /// Memory needed to store the threadblock-scoped GEMM tile + typedef typename GlobalLoadStream::ThreadblockTileStorage ThreadblockTileStorage; + + /// Assemble the shared load streams for A/B. + typedef SharedStreamPair SharedStream; + + /// Parameters object constructable on the host. + struct Params : public KernelLaunchConfiguration { + + /// GEMM problem size + GemmCoord problem_size; + + /// Parameters object for the global load stream + typename GlobalLoadStream::Params global_to_shared_stream; + + /// Parameters object for the shared load stream + typename SharedStream::Params shared_stream; + /// The params for the epilogue. typename Epilogue::Params epilogue; @@ -563,21 +431,36 @@ struct GemmTraits { template CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const& desc) { // Set the problem size. - this->m = desc.m; - this->n = desc.n; - this->k = desc.k; - - // Initialize the iterator for A. - int error_code = - global_stream_a.initialize(desc, reinterpret_cast(desc.d_a), desc.lda); - + problem_size = desc.problem_size; + + // Compute grid dimensions + BlockSwizzle block_swizzle; + this->block = dim3(GemmConfig::kThreads); + this->grid = block_swizzle.get_grid_layout( + problem_size, + make_Coord_from_shape()); + + // Compute offset to residue. + Index gemm_k = problem_size[0]; + Index offset_to_residue = (gemm_k % OutputTile::kD) ? 
gemm_k - (gemm_k % OutputTile::kD) : 0; + + // Initialize parameters objects for + int error_code = global_to_shared_stream.stream_a.initialize( + desc.A.data(), + desc.batch_stride_A, + desc.A.leading_dim(), + offset_to_residue + ); if (error_code) { return error_code; } - // Initialize the iterator for B. - error_code = - global_stream_b.initialize(desc, reinterpret_cast(desc.d_b), desc.ldb); + error_code = global_to_shared_stream.stream_b.initialize( + desc.B.data(), + desc.batch_stride_B, + desc.B.leading_dim(), + offset_to_residue + ); if (error_code) { return error_code; @@ -586,24 +469,81 @@ struct GemmTraits { // The epilogue. return epilogue.initialize(desc); } - }; - // The storage for A. - template - union StreamSharedStorage { - // The storage needed by the global stream. - typename GlobalLoadStream_::SharedStorage global; - // The storage needed by the shared stream. - typename SharedLoadStream_::SharedStorage shared; + /// Helper to construct a GEMM params using a BLAS-like API + CUTLASS_HOST_DEVICE int initialize(Index m, + Index n, + Index k, + typename Epilogue::Scalar alpha, + ScalarA const* d_a, + Index lda, + ScalarB const* d_b, + Index ldb, + typename Epilogue::Scalar beta, + ScalarC const* d_c, + Index ldc, + ScalarD* d_d, + Index ldd) { + GemmDesc desc( + GemmCoord(k, n, m, 1), + alpha, + TensorRef(d_a, lda), + TensorRef(d_b, ldb), + beta, + TensorRef(d_c, ldc), + TensorRef(d_d, ldd) + ); + + return this->initialize(desc); + } + + /// Helper to construct a batched GEMM params + CUTLASS_HOST_DEVICE int initialize(Index m, + Index n, + Index k, + typename Epilogue::Scalar alpha, + ScalarA const* d_a, + Index lda, + long long int batch_stride_A, + ScalarB const* d_b, + Index ldb, + long long int batch_stride_B, + typename Epilogue::Scalar beta, + ScalarC const* d_c, + Index ldc, + long long int batch_stride_C, + ScalarD* d_d, + Index ldd, + long long int batch_stride_D, + Index batch_count) { + + GemmDesc desc( + GemmCoord(k, n, m, batch_count), 
+ alpha, + TensorRef(d_a, lda), + batch_stride_A, + TensorRef(d_b, ldb), + batch_stride_B, + beta, + TensorRef(d_c, ldc), + batch_stride_C, + TensorRef(d_d, ldd), + batch_stride_D + ); + + return this->initialize(desc); + } }; // The storage for the main loop + prologue. struct MainLoopSharedStorage { - // The storage to shuffle the A matrix in shared memory. - StreamSharedStorage stream_a; - // The storage to shuffle the B matrix in shared memory. - StreamSharedStorage stream_b; - // The storage to clear the accumulators if needed. + /// Stores the threadblock tile + ThreadblockTileStorage threadblock_tile; + + /// Storage for GEMM global stream + typename GlobalLoadStream::SharedStorage global_to_shared_stream; + + /// Storage for clearing accumulators typename ClearAccumulators::SharedStorage clear; }; @@ -615,108 +555,18 @@ struct GemmTraits { typename Epilogue::SharedStorage epilogue; }; - /// Assemble the global load streams for A/B. - struct GlobalLoadStream { - /// Ctor. - CUTLASS_DEVICE GlobalLoadStream(Params const& params, - SharedStorage& shared_storage, - dim3 const& block) - : stream_a(params.global_stream_a, - shared_storage.main_loop.stream_a.global, - cutlass::make_Coord(0, params.k, params.m), - cutlass::make_Coord(0, 0, block.x)), - stream_b(params.global_stream_b, - shared_storage.main_loop.stream_b.global, - cutlass::make_Coord(0, params.k, params.n), - make_Coord(0, 0, block.y)) {} - - /// Trigger the copies from shared memory to registers. - CUTLASS_DEVICE void copy() { - stream_a.copy(); - stream_b.copy(); - } - - /// Commit the data. - CUTLASS_DEVICE void commit() { - stream_a.commit(); - stream_b.commit(); - } - - /// Move to residue portion. - template - CUTLASS_DEVICE void move_to_residue(Index k) { - GemmResidue::move_to_residue(stream_a, stream_b, k); - } - - /// Rollback to beginning of first tile and initialize predicates. - CUTLASS_DEVICE void rollback() { GemmResidue::rollback(stream_a, stream_b); } - - /// The stream for A. 
- GlobalLoadStreamA stream_a; - /// The stream for B. - GlobalLoadStreamB stream_b; - }; - - /// Assemble the shared load stream for A/B. - struct SharedLoadStream { - /// Ctor. - CUTLASS_DEVICE SharedLoadStream(Params const& params, SharedStorage& shared_storage) { - stream_a.initialize(params.shared_stream_a, shared_storage.main_loop.stream_a.shared); - stream_b.initialize(params.shared_stream_b, shared_storage.main_loop.stream_b.shared); - } - - /// Trigger the copies from shared memory to registers. - CUTLASS_DEVICE void copy(int step) { - stream_a.copy(step, fetched_a[step % 2]); - stream_b.copy(step, fetched_b[step % 2]); - } - - /// Commit the data. - CUTLASS_DEVICE void commit(int step) { - stream_a.commit(fetched_a[step % 2], transformed_a[step % 2]); - stream_b.commit(fetched_b[step % 2], transformed_b[step % 2]); - } - - /// The fragment A. - CUTLASS_DEVICE typename SharedLoadStreamA::Fragment const& fragment_a(int step) const { - return transformed_a[step % 2]; - } - - /// The fragment B. - CUTLASS_DEVICE typename SharedLoadStreamB::Fragment const& fragment_b(int step) const { - return transformed_b[step % 2]; - } - - /// Increment the stage. - CUTLASS_DEVICE void inc_stage() { - stream_a.inc_stage(); - stream_b.inc_stage(); - } - - /// The stream for A. - SharedLoadStreamA stream_a; - /// The fragments to fetch A. - typename SharedLoadStreamA::FetchedFragment fetched_a[2]; - /// The fragments to transform A. - typename SharedLoadStreamA::TransformedFragment transformed_a[2]; - /// The stream for B. - SharedLoadStreamB stream_b; - /// The fragments to fetch B. - typename SharedLoadStreamB::FetchedFragment fetched_b[2]; - /// The fragments to transform B. - typename SharedLoadStreamB::TransformedFragment transformed_b[2]; - }; - /// The memory fence for shared loads. 
static CUTLASS_DEVICE void shared_load_fence(bool in_loop) { if (SharedLoadStreamA::Iterator::kRequiresLoadFence || SharedLoadStreamB::Iterator::kRequiresLoadFence) { - __syncthreads(); + __syncthreads(); } } /// The memory fence for shared stores. - static CUTLASS_DEVICE void shared_store_fence(bool in_loop) { __syncthreads(); } + static CUTLASS_DEVICE void shared_store_fence(bool in_loop) { + __syncthreads(); + } }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -735,7 +585,10 @@ struct SimplifiedGemmTraitsHelper { MemorySpace::kShared> SharedStoreIteratorA; /// The stream to load A from global memory to shared memory. - typedef GlobalLoadStream + typedef GlobalLoadStream GlobalLoadStreamA; /// The global iterator to load B from global memory. @@ -750,7 +603,10 @@ struct SimplifiedGemmTraitsHelper { MemorySpace::kShared> SharedStoreIteratorB; /// The stream to load B from global memory to shared memory. - typedef GlobalLoadStream + typedef GlobalLoadStream GlobalLoadStreamB; /// The iterator to load A from shared memory. diff --git a/cutlass/gemm/hgemm_global_tile.h b/cutlass/gemm/hgemm_global_tile.h index f14dbb311a..9d5ffe8508 100644 --- a/cutlass/gemm/hgemm_global_tile.h +++ b/cutlass/gemm/hgemm_global_tile.h @@ -29,10 +29,10 @@ */ #pragma once -#include -#include -#include -#include +#include "cutlass/coord.h" +#include "cutlass/gemm/gemm_global_tile.h" +#include "cutlass/matrix_traits.h" +#include "cutlass/reshape_tile.h" namespace cutlass { namespace gemm { @@ -63,14 +63,14 @@ struct HgemmCrosswiseGlobalTileTraits : public GemmGlobalTileTraits< /// The threads. typedef typename Base::Threads Threads; /// The threads strides. - typedef Shape<1, 2, Base::Tile::kC> ThreadsDelta; + typedef Shape<1, 2, Base::VectorizedTile::kC> ThreadsDelta; /// The strides in each dimension between different loads/stores. typedef Shape Delta; /// The number of iterations needed to load/store the tile. 
- typedef Shape + Base::VectorizedTile::kW / Base::Threads::kW, + Base::VectorizedTile::kC / Base::kAccessSize> Iterations; /// Computes the thread offset in (H, W) based on thread ID struct ThreadOffset { diff --git a/cutlass/gemm/hgemm_multiply_add.h b/cutlass/gemm/hgemm_multiply_add.h index ebbdd06e87..7217d82c58 100644 --- a/cutlass/gemm/hgemm_multiply_add.h +++ b/cutlass/gemm/hgemm_multiply_add.h @@ -28,9 +28,9 @@ */ #pragma once -#include +#include "cutlass/fragment.h" -#include +#include "cutlass/gemm/thread_multiply_add.h" namespace cutlass { namespace gemm { @@ -38,16 +38,18 @@ namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// /// Template performing matrix multiply-add operation within a thread -template -struct ThreadMultiplyAdd { +template +struct ThreadMultiplyAdd { /// The shape of the instruction. typedef Shape<1, 1, 2, 1> InstructionShape; /// The number of accumulators per thread. - typedef AccumulatorsPerThread_ AccumulatorsPerThread; + typedef ThreadGemmShape_ ThreadGemmShape; + /// Aliased for compatibility. Will be removed for CUTLASS v2.0. + typedef ThreadGemmShape AccumulatorsPerThread; /// The number of threads per warp. typedef ThreadsPerWarp_ ThreadsPerWarp; /// The number of accumulators per warp. - typedef typename ShapeMul::Shape AccumulatorsPerWarp; + typedef typename ShapeMul::Shape AccumulatorsPerWarp; /// The type for A. typedef half ScalarA; /// The fragment for A. 
@@ -88,9 +90,9 @@ struct ThreadMultiplyAdd -#include +#include "cutlass/fragment.h" namespace cutlass { namespace gemm { diff --git a/cutlass/gemm/hgemm_traits.h b/cutlass/gemm/hgemm_traits.h index b08645bf40..2261bb4b3e 100644 --- a/cutlass/gemm/hgemm_traits.h +++ b/cutlass/gemm/hgemm_traits.h @@ -27,18 +27,18 @@ */ #pragma once -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "cutlass/convert.h" +#include "cutlass/reshape_tile.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_epilogue.h" +#include "cutlass/gemm/gemm_epilogue_traits.h" +#include "cutlass/gemm/gemm_global_tile.h" +#include "cutlass/gemm/gemm_shared_tile.h" +#include "cutlass/gemm/gemm_traits.h" +#include "cutlass/gemm/hgemm_global_tile.h" +#include "cutlass/gemm/hgemm_multiply_add.h" +#include "cutlass/gemm/hgemm_swizzle.h" namespace cutlass { namespace gemm { @@ -48,46 +48,52 @@ namespace gemm { template < /// The tile size for the GEMM KxNxM. typename OutputTile_, - /// The number of accumulators per thread. - typename AccumulatorsPerThread_, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_, /// The number of scalars per LDG for A. int kScalarsPerLdgA_ = 2, /// The number of scalars per LDG for B. int kScalarsPerLdgB_ = 2> -struct HgemmConfig - : public GemmConfig< - /// The scalar type for A. - half, - /// The scalar type for B. - half, - /// The scalar type for C. - half, - /// The scalar type for D. - half, - /// The tile size for the GEMM KxNxM. - OutputTile_, - /// The functor to do the math in the main loop. - ThreadMultiplyAdd, half, half, half>, - /// The number of scalars per LDG for A. - kScalarsPerLdgA_, - /// The number of scalars per STS for A. - kScalarsPerLdgA_, - /// The number of scalars per LDS for A. - 8, - /// The number of scalars per LDG for B. - kScalarsPerLdgB_, - /// The number of scalars per STS for B. 
- kScalarsPerLdgB_, - /// The number of scalars per LDS for B. - 8, - /// The number of scalars per LDG for C and STG for D. - 2, - /// The number of scalars per STS for D. - 8, - /// The number of scalars per LDS for D. - 2, - /// The number of stages in shared memory. - 2> {}; +struct HgemmConfig : public GemmConfig< + /// The scalar type for A. + half, + /// The scalar type for B. + half, + /// The scalar type for C. + half, + /// The scalar type for D. + half, + /// The tile size for the GEMM KxNxM. + OutputTile_, + /// The functor to do the math in the main loop. + ThreadMultiplyAdd, half, half, half>, + /// The number of scalars per LDG for A. + kScalarsPerLdgA_, + /// The number of scalars per STS for A. + kScalarsPerLdgA_, + /// The number of scalars per LDS for A. + 8, + /// The number of scalars per LDG for B. + kScalarsPerLdgB_, + /// The number of scalars per STS for B. + kScalarsPerLdgB_, + /// The number of scalars per LDS for B. + 8, + /// The number of scalars per LDG for C and STG for D. + 2, + /// The number of scalars per STS for D. + 8, + /// The number of scalars per LDS for D. + 2, + /// The number of stages in shared memory. + 2, + /// kResidueSeparate + false, + /// kResidueInPrologue + true, + /// kLaunchBounds + false + > {}; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -147,7 +153,6 @@ struct HgemmTileTraitsHelperA GemmConfig_::kScalarsPerLdgA> GlobalTileTraits; - /// The skew. static int const kSkewA = 128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2; /// The traits class to build the iterator to store data to shared memory for A^T. @@ -215,7 +220,6 @@ struct HgemmTileTraitsHelperB GemmConfig_::kScalarsPerLdgB> GlobalTileTraits; - /// The skew for B. static int const kSkewB = 128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2; /// The traits class to build the iterator to store data to shared memory for B^N. 
@@ -266,8 +270,8 @@ template < typename OutputTile_, /// The functor to do the math in the epilogue. typename EpilogueFunctor_, - /// The number of accumulators per thread. - typename AccumulatorsPerThread_ = Shape<8, 8, 16>, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_, /// The number of halfs loaded in one LDG for A. int kScalarsPerLdgA_ = 2, /// The number of halfs loaded in one LDG for B. @@ -276,8 +280,7 @@ template < typename Index_ = int> struct HgemmTraitsHelper { /// The HGEMM config. - typedef HgemmConfig - GemmConfig; + typedef HgemmConfig GemmConfig; /// The GEMM config for A. typedef HgemmTileTraitsHelperA GemmTileTraitsHelperA; /// The GEMM config for B. @@ -296,7 +299,10 @@ struct HgemmTraitsHelper { MemorySpace::kShared> SharedStoreIteratorA; /// The stream to load A from global memory to shared memory. - typedef GlobalLoadStream + typedef GlobalLoadStream GlobalLoadStreamA; /// The iterator to load B from global memory. @@ -312,7 +318,10 @@ struct HgemmTraitsHelper { MemorySpace::kShared> SharedStoreIteratorB; /// The stream to load B from global memory to shared memory. - typedef GlobalLoadStream + typedef GlobalLoadStream GlobalLoadStreamB; /// The iterator to load A from shared memory @@ -354,8 +363,8 @@ template < typename OutputTile_ = Shape<8, 128, 128>, /// The functor to do the math in the epilogue. typename EpilogueFunctor_ = LinearScaling, - /// The number of accumulators per thread. - typename AccumulatorsPerThread_ = Shape<8, 8, 16>, + /// Tile size for warp-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_ = Shape<8, 8, 16>, /// The number of halfs loaded in one LDG for A. int kScalarsPerLdgA_ = 2, /// The number of halfs loaded in one LDG for B. 
@@ -367,7 +376,7 @@ template < kLayoutB_, OutputTile_, EpilogueFunctor_, - AccumulatorsPerThread_, + ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_> > diff --git a/cutlass/gemm/igemm_epilogue.h b/cutlass/gemm/igemm_epilogue.h index 0d69980316..2ad24f32cc 100644 --- a/cutlass/gemm/igemm_epilogue.h +++ b/cutlass/gemm/igemm_epilogue.h @@ -28,13 +28,13 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include +#include "cutlass/convert.h" +#include "cutlass/fragment.h" +#include "cutlass/gemm/gemm_global_stream.h" +#include "cutlass/gemm/gemm_shared_stream.h" +#include "cutlass/gemm/igemm_global_tile.h" +#include "cutlass/reshape_tile.h" +#include "cutlass/tile_iterator.h" namespace cutlass { namespace gemm { @@ -269,8 +269,8 @@ struct IgemmEpilogueTraits : public GemmEpilogueTraits< typename Helper_::SharedStoreIteratorD, // The shared store transformer for D. typename Helper_::SharedStoreTransformerD, - // The iterator to load D from shared memory. - typename Helper_::SharedLoadIteratorD, + // The stream to load D from shared memory. + typename Helper_::SharedLoadStreamD, // The iterations. typename Helper_::Iterations, // The strides between iterations. @@ -294,9 +294,8 @@ struct IgemmEpilogue : public GemmEpilogue { /// Ctor. 
CUTLASS_DEVICE IgemmEpilogue(typename Base::Params const& params_, typename Base::SharedStorage& shared_storage_, - typename Base::Index m_, - typename Base::Index n_) - : Base(params_, shared_storage_, m_, n_) {} + Coord<3> const& _problem_size) + : Base(params_, shared_storage_, _problem_size) {} }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -309,9 +308,8 @@ struct IgemmEpilogue : public GemmEpilogue const& _problem_size) + : Base(params_, shared_storage_, _problem_size) {} }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cutlass/gemm/igemm_global_tile.h b/cutlass/gemm/igemm_global_tile.h index 3f594ac6ad..7a9c1573ae 100644 --- a/cutlass/gemm/igemm_global_tile.h +++ b/cutlass/gemm/igemm_global_tile.h @@ -32,9 +32,9 @@ */ #pragma once -#include -#include -#include +#include "cutlass/coord.h" +#include "cutlass/gemm/gemm_global_tile.h" +#include "cutlass/matrix_traits.h" namespace cutlass { namespace gemm { @@ -67,10 +67,10 @@ struct IgemmGlobalTileTraits : public GemmGlobalTileTraits< /// The strides in each dimension between different loads/stores. typedef Shape Delta; /// The number of iterations needed to load/store the tile. - typedef Shape + Base::VectorizedTile::kW / Base::Threads::kW, + Base::VectorizedTile::kC / Base::kAccessSize> Iterations; /// Computes the thread offset in (H, W) based on thread ID @@ -86,24 +86,11 @@ struct IgemmGlobalTileTraits : public GemmGlobalTileTraits< public: /// The threads strides. - typedef Shape<1, 4, Base::Tile::kC> ThreadsDelta; + typedef Shape<1, 4, Base::VectorizedTile::kC> ThreadsDelta; }; //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Deprecated. Please use IgemmGlobalTileTraits instead. 
- -template -struct IgemmContiguousGlobalTileTraits - : public IgemmGlobalTileTraits {}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - template struct IgemmGlobalIteratorAb : public GemmGlobalIteratorAb { /// The base class. @@ -114,11 +101,11 @@ struct IgemmGlobalIteratorAb : public GemmGlobalIteratorAb /// Constructor. CUTLASS_DEVICE IgemmGlobalIteratorAb(typename Base::Params const& _params, const Coord<3>& bounds, - const Coord<3>& block, + const Coord<3>& threadblock_offset, ThreadOffset thread_offset_func = ThreadOffset()) - : Base(_params, bounds, block, thread_offset_func), in_residue_(false), mask_(0xffffffff) { + : Base(_params, bounds, threadblock_offset, thread_offset_func), mask_(0xffffffff) { // The number of elements read in a single iteration. - int const kBlock = TileTraits_::Tile::kW * TileTraits_::kAccessSize; + int const kBlock = TileTraits_::Tile::kW; // The residue. int const kResidue = (int)(bounds[1] % kBlock); @@ -129,28 +116,12 @@ struct IgemmGlobalIteratorAb : public GemmGlobalIteratorAb } } - /// The accessor. - CUTLASS_DEVICE void get(typename Base::AccessType& value, int d, int h, int w, int c) const { - Base::get(value, d, h, w, c); - if (in_residue_) { - reinterpret_cast(value) &= mask_; - } - } - - /// Move to residue portion. - CUTLASS_DEVICE void move_to_residue(typename Base::Index k) { - Base::move_to_residue(k); - in_residue_ = true; - } - - /// Move back to the beginning of the first tile. - CUTLASS_DEVICE void rollback() { - Base::rollback(); - in_residue_ = false; + CUTLASS_DEVICE void load_element( + typename Base::AccessType& value, int d, int h, int w, int c) const { + Base::load_element(value, d, h, w, c); + reinterpret_cast(value) &= mask_; } - /// Are we in the residue? - bool in_residue_; /// The mask to clean up the values. 
uint32_t mask_; }; diff --git a/cutlass/gemm/igemm_multiply_add.h b/cutlass/gemm/igemm_multiply_add.h index 5a8baec533..5ff6c7c1b9 100644 --- a/cutlass/gemm/igemm_multiply_add.h +++ b/cutlass/gemm/igemm_multiply_add.h @@ -28,9 +28,9 @@ */ #pragma once -#include +#include "cutlass/fragment.h" -#include +#include "cutlass/gemm/thread_multiply_add.h" namespace cutlass { namespace gemm { @@ -38,16 +38,18 @@ namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// /// Template performing matrix multiply-add operation within a thread -template -struct ThreadMultiplyAdd { +template +struct ThreadMultiplyAdd { /// The shape of the instruction. typedef Shape<4, 1, 1> InstructionShape; - /// The number of accumulators per thread. - typedef AccumulatorsPerThread_ AccumulatorsPerThread; + /// Shape of the thread-level GEMM (K-by-N-by-M) + typedef ThreadGemmShape_ ThreadGemmShape; + /// Aliased for compatibility. Will be removed in CUTLASS v2.0 + typedef ThreadGemmShape AccumulatorsPerThread; /// The number of threads per warp. typedef ThreadsPerWarp_ ThreadsPerWarp; /// The number of accumulators per warp. - typedef typename ShapeMul::Shape AccumulatorsPerWarp; + typedef typename ShapeMul::Shape AccumulatorsPerWarp; /// The type for A. typedef int8_t ScalarA; /// The fragment for A. diff --git a/cutlass/gemm/igemm_swizzle.h b/cutlass/gemm/igemm_swizzle.h index 77cf7118df..fbb68d1434 100644 --- a/cutlass/gemm/igemm_swizzle.h +++ b/cutlass/gemm/igemm_swizzle.h @@ -27,7 +27,7 @@ */ #pragma once -#include +#include "cutlass/fragment.h" namespace cutlass { namespace gemm { @@ -82,6 +82,11 @@ struct IgemmSwizzle { int a2 = src_int[i2]; int a3 = src_int[i3]; + // // DEBUG. 
+ // if (threadIdx.x == 0) { + // printf("a=0x%08x 0x%08x 0x%08x 0x%08x\n", a0, a1, a2, a3); + // } + int b0, b1, b2, b3, c0; asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(b0) : "r"(a0), "r"(a1)); asm volatile("prmt.b32 %0, %1, %2, 0x0040;" : "=r"(c0) : "r"(a2), "r"(a3)); @@ -99,6 +104,11 @@ struct IgemmSwizzle { asm volatile("prmt.b32 %0, %1, %2, 0x0073;" : "=r"(c0) : "r"(a2), "r"(a3)); asm volatile("prmt.b32 %0, %1, %2, 0x5410;" : "=r"(b3) : "r"(b3), "r"(c0)); + // // DEBUG. + // if (threadIdx.x == 0) { + // printf("b=0x%08x 0x%08x 0x%08x 0x%08x\n", b0, b1, b2, b3); + // } + dst_int[i0] = b0; dst_int[i1] = b1; dst_int[i2] = b2; diff --git a/cutlass/gemm/igemm_traits.h b/cutlass/gemm/igemm_traits.h index 82f8de5cd0..5bceeda92e 100644 --- a/cutlass/gemm/igemm_traits.h +++ b/cutlass/gemm/igemm_traits.h @@ -29,18 +29,18 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "cutlass/convert.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_epilogue.h" +#include "cutlass/gemm/gemm_epilogue_traits.h" +#include "cutlass/gemm/gemm_global_tile.h" +#include "cutlass/gemm/gemm_shared_tile.h" +#include "cutlass/gemm/gemm_traits.h" +#include "cutlass/gemm/igemm_epilogue.h" +#include "cutlass/gemm/igemm_global_tile.h" +#include "cutlass/gemm/igemm_multiply_add.h" +#include "cutlass/gemm/igemm_swizzle.h" +#include "cutlass/reshape_tile.h" namespace cutlass { namespace gemm { @@ -52,49 +52,52 @@ template < typename OutputTile_, /// The output type. typename ScalarD_, - /// The number of accumulators per thread. - typename AccumulatorsPerThread_> -struct IgemmConfig - : public GemmConfig< - /// The scalar type for A. - int8_t, - /// The scalar type for B. - int8_t, - /// The scalar type for C. - ScalarD_, - /// The scalar type for D. - ScalarD_, - /// The tile size for the GEMM KxNxM. - OutputTile_, - /// The functor to do the math in the main loop. 
- ThreadMultiplyAdd, int8_t, int8_t, int>, - /// The number of scalars per LDG for A. - 4, - /// The number of scalars per STS for A. - 4, - /// The number of scalars per LDS for A. - 16, - /// The number of scalars per LDG for B. - 4, - /// The number of scalars per STS for B. - 4, - /// The number of scalars per LDS for B. - 16, - /// The number of scalars per LDG for C and STG for D. - 1, - /// The number of scalars per STS for D. - 4, - /// The number of scalars per LDS for D. - 1, - /// The number of stages in shared memory. - 2, - /// Enable the code path that deals with the residue in epilogue. - true> {}; + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_> +struct IgemmConfig : public GemmConfig< + /// The scalar type for A. + int8_t, + /// The scalar type for B. + int8_t, + /// The scalar type for C. + ScalarD_, + /// The scalar type for D. + ScalarD_, + /// The tile size for the GEMM KxNxM. + OutputTile_, + /// The functor to do the math in the main loop. + ThreadMultiplyAdd, int8_t, int8_t, int>, + /// The number of scalars per LDG for A. + 4, + /// The number of scalars per STS for A. + 4, + /// The number of scalars per LDS for A. + 16, + /// The number of scalars per LDG for B. + 4, + /// The number of scalars per STS for B. + 4, + /// The number of scalars per LDS for B. + 16, + /// The number of scalars per LDG for C and STG for D. + 1, + /// The number of scalars per STS for D. + 4, + /// The number of scalars per LDS for D. + 1, + /// The number of stages in shared memory. + 2, + /// kResidueSeparate + false, + /// kResidueInPrologue + false, + /// kLaunchBounds + false> {}; //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct IgemmConfig +template +struct IgemmConfig : public GemmConfig< /// The scalar type for A. int8_t, @@ -107,7 +110,7 @@ struct IgemmConfig /// The tile size for the GEMM KxNxM. 
OutputTile_, /// The functor to do the math in the main loop. - ThreadMultiplyAdd, int8_t, int8_t, int>, + ThreadMultiplyAdd, int8_t, int8_t, int>, /// The number of scalars per LDG for A. 4, /// The number of scalars per STS for A. @@ -128,8 +131,12 @@ struct IgemmConfig 4, /// The number of stages in shared memory. 2, - /// Enable the code path that deals with the residue in epilogue. - true> {}; + /// If true, separate mainloop is instantiated from residue + false, + /// Compute residue in prolog? + true, + /// Launch bounds? + false> {}; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -162,7 +169,7 @@ struct IgemmTileTraitsHelperA GemmConfig_::kScalarsPerLdgA> GlobalTileTraits; - // The iterator. + /// The global load iterator. typedef GemmGlobalIteratorAb GlobalLoadIterator; /// The traits class to build the iterator to store data to shared memory for A^N. @@ -208,7 +215,7 @@ struct IgemmTileTraitsHelperA { GemmConfig_::kScalarsPerLdgA> GlobalTileTraits; - // The iterator. + /// The global load iterator. typedef IgemmGlobalIteratorAb GlobalLoadIterator; /// The traits class to build the iterator to store data to shared memory for A^N. @@ -281,7 +288,7 @@ struct IgemmTileTraitsHelperB { GemmConfig_::kScalarsPerLdgB> GlobalTileTraits; - // The iterator. + /// The global load iterator. typedef IgemmGlobalIteratorAb GlobalLoadIterator; /// The traits class to build the iterator to store data to shared memory for B^N. @@ -345,7 +352,7 @@ struct IgemmTileTraitsHelperB GemmConfig_::kScalarsPerLdgB> GlobalTileTraits; - // The iterator. + /// The global load iterator. typedef GemmGlobalIteratorAb GlobalLoadIterator; /// The traits class to build the iterator to store data to shared memory for B^N. @@ -404,13 +411,13 @@ template < typename ScalarD_, /// The functor to do the math in the epilogue. typename EpilogueFunctor_, - /// The number of accumulators per thread. 
- typename AccumulatorsPerThread_ = Shape<32, 8, 8>, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_ = Shape<32, 8, 8>, /// The index. typename Index_ = int> struct IgemmTraitsHelper { /// The IGEMM config. - typedef IgemmConfig GemmConfig; + typedef IgemmConfig GemmConfig; /// The GEMM config for A. typedef IgemmTileTraitsHelperA GemmTileTraitsHelperA; /// The GEMM config for B. @@ -418,7 +425,6 @@ struct IgemmTraitsHelper { /// The iterator to load A from global memory. typedef typename GemmTileTraitsHelperA::GlobalLoadIterator GlobalLoadIteratorA; - /// The default transformer for A. typedef typename IgemmTransformerA::Transformer GlobalTransformerA; @@ -429,12 +435,14 @@ struct IgemmTraitsHelper { MemorySpace::kShared> SharedStoreIteratorA; /// The stream to load A from global memory to shared memory. - typedef GlobalLoadStream + typedef GlobalLoadStream GlobalLoadStreamA; /// The iterator to load B from global memory. typedef typename GemmTileTraitsHelperB::GlobalLoadIterator GlobalLoadIteratorB; - // The default transformer for B. typedef typename IgemmTransformerB::Transformer GlobalTransformerB; @@ -445,7 +453,10 @@ struct IgemmTraitsHelper { MemorySpace::kShared> SharedStoreIteratorB; /// The stream to load B from global memory to shared memory. - typedef GlobalLoadStream + typedef GlobalLoadStream GlobalLoadStreamB; /// The iterator to load A from shared memory. @@ -501,8 +512,8 @@ template < typename ScalarD_ = int, /// The functor to do the math in the epilogue. typename EpilogueFunctor_ = LinearScaling::Scalar>, - /// The number of accumulators per thread. - typename AccumulatorsPerThread_ = Shape<32, 8, 8>, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_ = Shape<32, 8, 8>, /// The index. typename Index_ = int, /// The helper class. 
@@ -511,7 +522,7 @@ template < OutputTile_, ScalarD_, EpilogueFunctor_, - AccumulatorsPerThread_, + ThreadGemmShape_, Index_> > struct IgemmTraits : public GemmTraits< // The config. diff --git a/cutlass/gemm/linear_scaling.h b/cutlass/gemm/linear_scaling.h index 979c93f962..a12fc5f19f 100644 --- a/cutlass/gemm/linear_scaling.h +++ b/cutlass/gemm/linear_scaling.h @@ -1,3 +1,4 @@ + /*************************************************************************************************** * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. * @@ -27,18 +28,31 @@ */ #pragma once -#include +#include "cutlass/fragment_multiply_add.h" namespace cutlass { namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// +template +CUTLASS_DEVICE bool is_zero(T x) { + return x == T(0); +} + +#if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) +CUTLASS_DEVICE bool is_zero(half x) { return reinterpret_cast(x) == int16_t(0); } +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Functor to compute linear combination of fragments -template > +template > struct LinearScaling { // The scalar. typedef Scalar_ Scalar; + // The accumulator Type + typedef typename FragmentMultiplyAdd_::ScalarAccum ScalarAccum; // The adapater. typedef FragmentMultiplyAdd_ FragmentMultiplyAdd; @@ -47,6 +61,21 @@ struct LinearScaling { /// The alpha/beta scaling params. Scalar alpha, beta; + // + // Methods + // + + // Constructor + CUTLASS_HOST_DEVICE + Params(Scalar _alpha = 0, Scalar _beta = 0) : alpha(_alpha), beta(_beta) {} + + /// Initialize the parameters + CUTLASS_HOST_DEVICE int initialize(Scalar _alpha, Scalar _beta) { + alpha = _alpha; + beta = _beta; + return 0; + } + /// Initialize the parameters. 
template CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const& desc) { @@ -56,14 +85,53 @@ struct LinearScaling { } }; + // + // Data members + // + + Params params; + + // + // Methods + // + /// Ctor. - CUTLASS_DEVICE LinearScaling(Params const& params) : alpha(params.alpha), beta(params.beta) {} + CUTLASS_DEVICE LinearScaling() { } + + /// Ctor. + CUTLASS_DEVICE LinearScaling(Params const& _params) : params(_params) {} + + /// Method to determine whether the source accumulator matrix C is ever needed. This method + /// may always safely return true, though better performance is possible if the source accumulator + /// matrix is never loaded unnecessarily. + CUTLASS_DEVICE + bool source_required() const { + return !is_zero(params.beta); + } /// Evaluate the functor. template CUTLASS_DEVICE void evaluate(FragmentA_ const& accum, FragmentB_& output) { FragmentMultiplyAdd mad; - mad.multiply(alpha, accum, output); + mad.multiply(params.alpha, accum, output); + + } + + /// Evaluate the functor, without using fragment in the API + template + CUTLASS_DEVICE void evaluate(ScalarAccum const *accum, ScalarOutput *output) { + Fragment FragAccum; + Fragment FragOutput; +#pragma unroll + for (int i = 0; i < size; i++) { + FragAccum[i] = accum[i]; + FragOutput[i] = output[i]; + } + evaluate(FragAccum, FragOutput); +#pragma unroll + for (int i = 0; i < size; i++) { + output[i] = FragOutput[i]; + } } /// Evaluate the functor. @@ -71,12 +139,28 @@ struct LinearScaling { CUTLASS_DEVICE void evaluate(FragmentA_ const& accum, FragmentB_ const& old, FragmentB_& output) { FragmentMultiplyAdd mad; FragmentB_ tmp; - mad.multiply(beta, old, tmp); - mad.multiply_add(alpha, accum, tmp, output); + mad.multiply(params.beta, old, tmp); + mad.multiply_add(params.alpha, accum, tmp, output); } - /// The alpha/beta scaling factors. 
- Scalar alpha, beta; + /// Evaluate the functor, without using fragment in the API + template + CUTLASS_DEVICE void evaluate(ScalarAccum const *accum, ScalarOutput const *old, ScalarOutput *output) { + Fragment FragAccum; + Fragment FragOutput; + Fragment FragOld; +#pragma unroll + for (int i = 0; i < size; i++) { + FragAccum[i] = accum[i]; + FragOutput[i] = output[i]; + FragOld[i] = old[i]; + } + evaluate(FragAccum, FragOld, FragOutput); +#pragma unroll + for (int i = 0; i < size; i++) { + output[i] = FragOutput[i]; + } + } }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cutlass/gemm/linear_scaling_device_ptr.h b/cutlass/gemm/linear_scaling_device_ptr.h new file mode 100644 index 0000000000..5dc845da4a --- /dev/null +++ b/cutlass/gemm/linear_scaling_device_ptr.h @@ -0,0 +1,149 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implements the BLAS linear scaling function alpha*AB + beta*C +*/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/scalar_or_pointer.h" +#include "cutlass/gemm/linear_scaling.h" + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace gemm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Functor to compute linear combination of fragments. This is intended to support passing scalars +/// either by value from the host or by reference to device-side scalar elements. This is inspired +/// by cuBLAS's device pointer mode. +template > +struct LinearScalingDevicePtr : public LinearScaling { + + /// Linear Scaling class used + typedef LinearScaling Base; + + // The scalar. + typedef typename Base::Scalar Scalar; + + /// The parameters. 
+ class Params { + private: + /// Alpha scalar + detail::ScalarOrPointer alpha_; + + /// Beta scalar + detail::ScalarOrPointer beta_; + + public: + // + // Methods + // + + // Default constructor + CUTLASS_HOST_DEVICE + Params() {} + + // Constructor taking alpha and beta by value + CUTLASS_HOST_DEVICE + Params( + Scalar alpha, + Scalar beta + ): + alpha_(alpha), + beta_(beta) {} + + // Constructor taking pointers to alpha and beta; each member is bound to its own pointer + CUTLASS_HOST_DEVICE + Params( + Scalar const *alpha_ptr, + Scalar const *beta_ptr + ): + alpha_(alpha_ptr), + beta_(beta_ptr) {} + + /// Initialize the parameters from scalars passed by value + CUTLASS_HOST_DEVICE int initialize( + Scalar alpha, + Scalar beta) { + + alpha_ = alpha; + beta_ = beta; + + return 0; + } + + /// Initialize the parameters from pointers to the scalars + CUTLASS_HOST_DEVICE int initialize( + Scalar const *alpha, + Scalar const *beta) { + + alpha_ = alpha; + beta_= beta; + + return 0; + } + + /// Initialize the parameters from the alpha and beta members of a GEMM descriptor. + template + CUTLASS_HOST_DEVICE int initialize(GemmDesc_ const& desc) { + + alpha_ = desc.alpha; + beta_ = desc.beta; + + return 0; + } + + /// Gets the alpha scalar (converted from the underlying scalar-or-pointer member) + CUTLASS_HOST_DEVICE + Scalar alpha() const { + return alpha_; + } + + /// Gets the beta scalar (converted from the underlying scalar-or-pointer member) + CUTLASS_HOST_DEVICE + Scalar beta() const { + return beta_; + } + }; + + // + // Methods + // + + /// Ctor. Reads alpha and beta through the Params accessors and stores them in the inherited params member. + CUTLASS_HOST_DEVICE LinearScalingDevicePtr(Params const& _params) { + this->params.alpha = _params.alpha(); + this->params.beta = _params.beta(); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm +} // namespace cutlass diff --git a/cutlass/gemm/scalar_or_pointer.h b/cutlass/gemm/scalar_or_pointer.h new file mode 100644 index 0000000000..7c4b4b75d0 --- /dev/null +++ b/cutlass/gemm/scalar_or_pointer.h @@ -0,0 +1,129 @@ + +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines a helper class that operates as either a scalar or a pointer to a scalar +*/ +#pragma once + +#include "cutlass/cutlass.h" + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +/// Helper class defines an object which operates as either a scalar or a pointer.
If the pointer +/// is non-null, it is dereferenced when the object is accessed. +template +class ScalarOrPointer { +public: + /// Underlying scalar type + typedef Scalar_ Scalar; + +private: + // + // Data members + // + + /// Scalar value + Scalar scalar; + + /// Pointer to use if non null + Scalar const *ptr; + +public: + + // + // Methods + // + + /// Default ctor: zero scalar, null pointer + CUTLASS_HOST_DEVICE + ScalarOrPointer(): scalar(0), ptr(nullptr) {} + + /// Object behaves as a scalar + CUTLASS_HOST_DEVICE + ScalarOrPointer(Scalar const &val): scalar(val), ptr(nullptr) {} + + /// Object behaves as a pointer (the stored scalar is set to zero) + CUTLASS_HOST_DEVICE + ScalarOrPointer(Scalar const *ptr_): scalar(0), ptr(ptr_) {} + + /// Returns true if the pointer is non-null + CUTLASS_HOST_DEVICE + bool is_pointer() const { + return bool(ptr); + } + + /// Gets the pointer value + CUTLASS_HOST_DEVICE + Scalar const *get_ptr() const { + return ptr; + } + + /// Gets the stored scalar value (does not dereference the pointer) + CUTLASS_HOST_DEVICE + Scalar get_scalar() const { + return scalar; + } + + /// Assigns to a scalar and sets pointer to nullptr + CUTLASS_HOST_DEVICE + ScalarOrPointer &operator=(Scalar const &scalar_) { + scalar = scalar_; + ptr = nullptr; + return *this; + } + + /// Assigns to a pointer value (the stored scalar is left unchanged) + CUTLASS_HOST_DEVICE + ScalarOrPointer &operator=(Scalar const *ptr_) { + ptr = ptr_; + return *this; + } + + /// Access the element: dereferences the pointer if it is non-null, otherwise returns the stored scalar + CUTLASS_HOST_DEVICE + Scalar get() const { + if (ptr) { + return *ptr; + } + return scalar; + } + + /// Accesses the element via implicit conversion (same result as get()) + CUTLASS_HOST_DEVICE + operator Scalar() const { + return get(); + } +}; + +} // namespace detail + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/cutlass/gemm/sgemm_traits.h b/cutlass/gemm/sgemm_traits.h index 66b7677486..8ce7f58e26 100644 --- a/cutlass/gemm/sgemm_traits.h +++ b/cutlass/gemm/sgemm_traits.h @@ -27,13 +27,13 @@ */ #pragma once -#include -#include -#include -#include -#include -#include -#include
+#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_epilogue.h" +#include "cutlass/gemm/gemm_epilogue_traits.h" +#include "cutlass/gemm/gemm_global_tile.h" +#include "cutlass/gemm/gemm_shared_tile.h" +#include "cutlass/gemm/gemm_traits.h" +#include "cutlass/gemm/thread_multiply_add.h" namespace cutlass { namespace gemm { @@ -43,46 +43,53 @@ namespace gemm { template < /// The tile size for the GEMM KxNxM. typename OutputTile_, - /// The number of accumulators per thread. - typename AccumulatorsPerThread_, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_, /// The number of scalars per LDG for A. int kScalarsPerLdgA_ = 1, /// The number of scalars per LDG for B. - int kScalarsPerLdgB_ = 1> -struct SgemmConfig - : public GemmConfig< - /// The scalar type for A. - float, - /// The scalar type for B. - float, - /// The scalar type for C. - float, - /// The scalar type for D. - float, - /// The tile size for the GEMM KxNxM. - OutputTile_, - /// The functor to do the math in the main loop. - ThreadMultiplyAdd, float, float, float>, - /// The number of scalars per LDG for A. - kScalarsPerLdgA_, - /// The number of scalars per STS for A. - kScalarsPerLdgA_, - /// The number of scalars per LDS for A. - 4, - /// The number of scalars per LDG for B. - kScalarsPerLdgB_, - /// The number of scalars per STS for B. - kScalarsPerLdgB_, - /// The number of scalars per LDS for B. - 4, - /// The number of scalars per LDG for C and STG for D. - 1, - /// The number of scalars per STS for D. - 4, - /// The number of scalars per LDS for D. - 1, - /// The number of stages in shared memory. - 2> {}; + int kScalarsPerLdgB_ = 1, + /// Whether to specify launch bounds + bool kLaunchBounds = true> +struct SgemmConfig : public GemmConfig< + /// The scalar type for A. + float, + /// The scalar type for B. + float, + /// The scalar type for C. + float, + /// The scalar type for D. + float, + /// The tile size for the GEMM KxNxM. 
+ OutputTile_, + /// The functor to do the math in the main loop. + ThreadMultiplyAdd, float, float, float>, + /// The number of scalars per LDG for A. + kScalarsPerLdgA_, + /// The number of scalars per STS for A. + kScalarsPerLdgA_, + /// The number of scalars per LDS for A. + 4, + /// The number of scalars per LDG for B. + kScalarsPerLdgB_, + /// The number of scalars per STS for B. + kScalarsPerLdgB_, + /// The number of scalars per LDS for B. + 4, + /// The number of scalars per LDG for C and STG for D. + 1, + /// The number of scalars per STS for D. + 4, + /// The number of scalars per LDS for D. + 1, + /// The number of stages in shared memory. + 2, + /// kResidueSeparate + false, + /// kResidueInPrologue + true, + /// kLaunchBounds + kLaunchBounds> {}; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -95,8 +102,8 @@ template < typename OutputTile_ = Shape<8, 128, 128>, /// The functor to use in the epilogue. typename EpilogueFunctor_ = LinearScaling, - /// The number of accumulators per thread. - typename AccumulatorsPerThread_ = Shape<8, 8, 8>, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_ = Shape<8, 8, 8>, /// The number of floats loaded in one LDG for A. int kScalarsPerLdgA_ = 1, /// The number of floats loaded in one LDG for B. @@ -105,7 +112,7 @@ template < typename Index_ = int, /// The SGEMM config. typename GemmConfig_ = - SgemmConfig, + SgemmConfig, /// The traits class for the epilogue. typename GemmEpilogueTraits_ = SimplifiedGemmEpilogueTraits > @@ -123,5 +130,43 @@ struct SgemmTraits : public SimplifiedGemmTraits< //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Helper to define SGEMM traits using Launch Bounds +template < + /// The layout for A. + MatrixLayout::Kind kLayoutA_, + /// The layout for B. + MatrixLayout::Kind kLayoutB_, + /// The output tile. 
+ typename OutputTile_ = Shape<8, 128, 128>, + /// The functor to use in the epilogue. + typename EpilogueFunctor_ = LinearScaling, + /// Tile size for thread-level GEMM (K-by-N-by-M) + typename ThreadGemmShape_ = Shape<8, 8, 8>, + /// The number of floats loaded in one LDG for A. + int kScalarsPerLdgA_ = 1, + /// The number of floats loaded in one LDG for B. + int kScalarsPerLdgB_ = 1, + /// The index. + typename Index_ = int, + /// The SGEMM config. + typename GemmConfig_ = + SgemmConfig, + /// The traits class for the epilogue. + typename GemmEpilogueTraits_ = + SimplifiedGemmEpilogueTraits > +struct SgemmLBTraits : public SimplifiedGemmTraits< + // The layout for A. + kLayoutA_, + // The layout for B. + kLayoutB_, + // The config. + GemmConfig_, + // The epilogue. + GemmEpilogue, + // The index. + Index_> {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gemm } // namespace cutlass diff --git a/cutlass/gemm/thread_multiply_add.h b/cutlass/gemm/thread_multiply_add.h index 20dca15965..b95dee58a0 100644 --- a/cutlass/gemm/thread_multiply_add.h +++ b/cutlass/gemm/thread_multiply_add.h @@ -27,7 +27,7 @@ */ #pragma once -#include +#include "cutlass/fragment.h" namespace cutlass { namespace gemm { @@ -35,20 +35,23 @@ namespace gemm { //////////////////////////////////////////////////////////////////////////////////////////////////// /// Template performing matrix multiply-add operation within a thread -template + typename ScalarC_, + MatrixLayout::Kind kLayout_ = MatrixLayout::kColumnMajor> struct ThreadMultiplyAdd { /// The shape of the instruction. typedef Shape<1, 1, 1, 1> InstructionShape; - /// The number of accumulators per thread. - typedef AccumulatorsPerThread_ AccumulatorsPerThread; + /// The shape of a thread-level matrix multiply accumulate. + typedef ThreadGemmShape_ ThreadGemmShape; + /// Aliased to "AccumulatorsPerThread" for compatibility.
Expect to be renamed in CUTLASS v2.0 + typedef ThreadGemmShape AccumulatorsPerThread; /// The number of threads per warp. typedef ThreadsPerWarp_ ThreadsPerWarp; /// The number of accumulators per warp. - typedef typename ShapeMul::Shape AccumulatorsPerWarp; + typedef typename ShapeMul::Shape AccumulatorsPerWarp; /// The type for A. typedef ScalarA_ ScalarA; /// The fragment for A. @@ -70,9 +73,18 @@ struct ThreadMultiplyAdd { FragmentB const& b, Accumulators const& c, Accumulators& d) { - for (int j = 0; j < AccumulatorsPerThread::kH; ++j) { - for (int i = 0; i < AccumulatorsPerThread::kW; ++i) { - d[j * AccumulatorsPerThread::kW + i] = a[i] * b[j] + c[j * AccumulatorsPerThread::kW + i]; + if(kLayout_ == MatrixLayout::kColumnMajor) { + for (int j = 0; j < AccumulatorsPerThread::kH; ++j) { + for (int i = 0; i < AccumulatorsPerThread::kW; ++i) { + d[j * AccumulatorsPerThread::kW + i] = a[i] * b[j] + c[j * AccumulatorsPerThread::kW + i]; + } + } + } + else { + for(int i = 0; i < AccumulatorsPerThread::kW; ++i) { + for(int j = 0; j < AccumulatorsPerThread::kH; ++j) { + d[i * AccumulatorsPerThread::kH + j] = a[i] * b[j] + c[i * AccumulatorsPerThread::kH + j]; + } } } } diff --git a/cutlass/gemm/threadblock_swizzle.h b/cutlass/gemm/threadblock_swizzle.h new file mode 100644 index 0000000000..fe7a3be7f5 --- /dev/null +++ b/cutlass/gemm/threadblock_swizzle.h @@ -0,0 +1,387 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines functors for mapping blockIdx to partitions of the GEMM computation.
+*/ +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/gemm/gemm_coord.h" + +namespace cutlass { +namespace gemm { + +struct swizzleDirection { + enum Kind { Boustrophedon, OneDirection }; +}; +// helper template function: linearizes the 2-D block index as blockIdx.y * gridDim.x + blockIdx.x +template +CUTLASS_DEVICE int getLinearIdx(int groups) { + // groups is not used for OneDirection Swizzle + return blockIdx.y * gridDim.x + blockIdx.x; +} +template <> +CUTLASS_DEVICE int getLinearIdx(int groups) { + // reverse blockIdx.x when (blockIdx.y / groups) is odd + if ((blockIdx.y / groups) % 2 == 1) + return blockIdx.y * gridDim.x + (gridDim.x - blockIdx.x - 1); + else + return blockIdx.y * gridDim.x + blockIdx.x; +} +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/*!@defgroup IdentityBlockSwizzle Identity Block Swizzle +@{ + Block Swizzle provides the mapping logic between a block in the physical memory of Matrix C and +Thread Block + Identity Block Swizzle effectively maps blocks in leading dimension order (column major) with +thread block + in leading dimension order (blockIdx.x) + blockIdx.z is mapped with batch_count for batched GEMM +@} +*/ +struct IdentityBlockSwizzle { + /// Ctor. aka ColumnMajorBlockSwizzle<1> + CUTLASS_HOST_DEVICE IdentityBlockSwizzle() {} + + /// Swizzle the block index.
+ CUTLASS_DEVICE dim3 swizzle() { return blockIdx; } + + /// Computes the grid: x covers problem_size.m() in steps of OutputTile[2], y covers problem_size.n() in steps of OutputTile[1] (both rounded up), z is problem_size.batch() + CUTLASS_HOST_DEVICE dim3 get_grid_layout(GemmCoord const &problem_size, + Coord<3> const &OutputTile) { + /*OutputTile and problem_size are both in KNM order*/ + dim3 grid; + grid.x = (problem_size.m() + OutputTile[2] - 1) / OutputTile[2]; + grid.y = (problem_size.n() + OutputTile[1] - 1) / OutputTile[1]; + grid.z = problem_size.batch(); + return grid; + } + + /// Returns the coordinate (0, block.y * OutputTile[1], block.x * OutputTile[2]) of the swizzled block + CUTLASS_DEVICE Coord<3> get_threadblock_offset(Coord<3> const &OutputTile) { + dim3 block = swizzle(); + Coord<3> threadblock_offset = + make_Coord(0, block.y * OutputTile[1], block.x * OutputTile[2]); + return threadblock_offset; + } + + /// Returns the z component of the swizzled block index + CUTLASS_DEVICE int get_batch_id() { + dim3 block = swizzle(); + return block.z; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/* +ColumnMajorBlockSwizzle<1, OneDirection> is equivalent to IdentityBlockSwizzle +groupCols has the effect of controlling the scheduling of thread blocks +settings with different groupCols can contribute to the overall performance by affecting L2 cache +hit rate + +consider a regular thread block mapping between matrix C and different thread blocks +note that C is column major, and the leading dimension of thread block id is blockIdx.x + +let's look at an example where gridDim.x = 6, gridDim.y = 7, gridDim.z = 1 +(blockIdx.x, blockIdx.y) +mapping between threadblockID and C matrix: +------------------------------------------------------- +(0,0) | (0,1) | (0,2) | (0,3) | (0,4) | (0,5) | (0,6) | +------------------------------------------------------- +(1,0) | (1,1) | (1,2) | (1,3) | (1,4) | (1,5) | (1,6) | +------------------------------------------------------- +(2,0) | (2,1) | (2,2) | (2,3) | (2,4) | (2,5) | (2,6) | +------------------------------------------------------- +(3,0) | (3,1) | (3,2) | (3,3) | (3,4) | (3,5) | (3,6) | +------------------------------------------------------- +(4,0) | (4,1) |
(4,2) | (4,3) | (4,4) | (4,5) | (4,6) | +------------------------------------------------------- +(5,0) | (5,1) | (5,2) | (5,3) | (5,4) | (5,5) | (5,6) | +------------------------------------------------------- + +A ColumnMajorBlockSwizzle<1, OneDirection> will imply the above order where threadblocks are +launched in a column major + +A ColumnMajorBlockSwizzle<2, OneDirection> swizzles things a little, +------------------------------------------------------- +(0,0) | (3,0) | (0,2) | (3,2) | (0,4) | (3,4) | (0,6) | +------------------------------------------------------- +(0,1) | (3,1) | (0,3) | (3,3) | (0,5) | (3,5) | (1,6) | +------------------------------------------------------- +(1,0) | (4,0) | (1,2) | (4,2) | (1,4) | (4,4) | (2,6) | +------------------------------------------------------- +(1,1) | (4,1) | (1,3) | (4,3) | (1,5) | (4,5) | (3,6) | +------------------------------------------------------- +(2,0) | (5,0) | (2,2) | (5,2) | (2,4) | (5,4) | (4,6) | +------------------------------------------------------- +(2,1) | (5,1) | (2,3) | (5,3) | (2,5) | (5,5) | (5,6) | +------------------------------------------------------- + +so in memory, it would apprear that we work on 2 columns at a time rather than 1 +Note that the index here really represent how each block maps to memory + +A ColumnMajorBlockSwizzle<1, Boustrophedon> is similar to ColumnMajorBlockSwizzle<1, OneDirection> +except that every column flips the ordering against the previous one +------------------------------------------------------- +(0,0) | (5,1) | (0,2) | (5,3) | (0,4) | (5,5) | (0,6) | +------------------------------------------------------- +(1,0) | (4,1) | (1,2) | (4,3) | (1,4) | (4,5) | (1,6) | +------------------------------------------------------- +(2,0) | (3,1) | (2,2) | (3,3) | (2,4) | (3,5) | (2,6) | +------------------------------------------------------- +(3,0) | (2,1) | (3,2) | (2,3) | (3,4) | (2,5) | (3,6) | +------------------------------------------------------- +(4,0) | 
(1,1) | (4,2) | (1,3) | (4,4) | (1,5) | (4,6) | +------------------------------------------------------- +(5,0) | (0,1) | (5,2) | (0,3) | (5,4) | (0,5) | (5,6) | +------------------------------------------------------- + +similarily, A ColumnMajorBlockSwizzle<2, Boustrophedon> looks like +------------------------------------------------------- +(0,0) | (3,0) | (2,3) | (5,3) | (0,4) | (3,4) | (5,6) | +------------------------------------------------------- +(0,1) | (3,1) | (2,2) | (5,2) | (0,5) | (3,5) | (4,6) | +------------------------------------------------------- +(1,0) | (4,0) | (1,3) | (4,3) | (1,4) | (4,4) | (3,6) | +------------------------------------------------------- +(1,1) | (4,1) | (1,2) | (4,2) | (1,5) | (4,5) | (2,6) | +------------------------------------------------------- +(2,0) | (5,0) | (0,3) | (3,3) | (2,4) | (5,4) | (1,6) | +------------------------------------------------------- +(2,1) | (5,1) | (0,2) | (3,2) | (2,5) | (5,5) | (0,6) | +------------------------------------------------------- + +*/ + +template +struct ColumnMajorBlockSwizzle { + /// Ctor. + CUTLASS_HOST_DEVICE ColumnMajorBlockSwizzle() {} + + /// Swizzle the block index. 
+ CUTLASS_DEVICE dim3 swizzle() { + assert(gridDim.z == 1); + int linearIdx = getLinearIdx(groupCols); + dim3 swizzledBlockIdx; + int currGroupCols = groupCols; + int prevGroupCols = groupCols; + + if ((gridDim.y % groupCols != 0) && ((blockIdx.y + (gridDim.y % groupCols)) >= gridDim.y)) { + // last columns if gridDim.y is not divisible by groupCols + currGroupCols = gridDim.y % groupCols; + } + + swizzledBlockIdx.x = (linearIdx / currGroupCols) % gridDim.x; + swizzledBlockIdx.y = + linearIdx % currGroupCols + prevGroupCols * (linearIdx / (prevGroupCols * gridDim.x)); + swizzledBlockIdx.z = blockIdx.z; + + return swizzledBlockIdx; + } + + /// Computes the grid: x covers problem_size.m() in steps of OutputTile[2], y covers problem_size.n() in steps of OutputTile[1] (both rounded up), z is problem_size.batch() + CUTLASS_HOST_DEVICE dim3 get_grid_layout(GemmCoord const &problem_size, + Coord<3> const &OutputTile) { + dim3 grid; + grid.x = (problem_size.m() + OutputTile[2] - 1) / OutputTile[2]; + grid.y = (problem_size.n() + OutputTile[1] - 1) / OutputTile[1]; + grid.z = problem_size.batch(); + return grid; + } + + /// Returns the coordinate (0, block.y * OutputTile[1], block.x * OutputTile[2]) of the swizzled block + CUTLASS_DEVICE Coord<3> get_threadblock_offset(Coord<3> const &OutputTile) { + dim3 block = swizzle(); + Coord<3> threadblock_offset = + make_Coord(0, block.y * OutputTile[1], block.x * OutputTile[2]); + return threadblock_offset; + } + + /// Returns the z component of the swizzled block index + CUTLASS_DEVICE int get_batch_id() { + dim3 block = swizzle(); + return block.z; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/* + +consider a regular thread block mapping between matrix C and different thread blocks +note that C is column major, and the leading dimension of thread block id is blockIdx.x + +let's look at an example where gridDim.x = 6, gridDim.y = 7, gridDim.z = 1 +(blockIdx.x, blockIdx.y) +mapping between threadblockID and C matrix: +------------------------------------------------------- +(0,0) | (0,1) | (0,2) | (0,3) | (0,4) | (0,5) | (0,6) | +------------------------------------------------------- +(1,0) | (1,1) | (1,2) | (1,3) | (1,4) | (1,5) | (1,6) |
+------------------------------------------------------- +(2,0) | (2,1) | (2,2) | (2,3) | (2,4) | (2,5) | (2,6) | +------------------------------------------------------- +(3,0) | (3,1) | (3,2) | (3,3) | (3,4) | (3,5) | (3,6) | +------------------------------------------------------- +(4,0) | (4,1) | (4,2) | (4,3) | (4,4) | (4,5) | (4,6) | +------------------------------------------------------- +(5,0) | (5,1) | (5,2) | (5,3) | (5,4) | (5,5) | (5,6) | +------------------------------------------------------- + +A RowMajorBlockSwizzle<1, OneDirection> will effectively transpose the map + +----------------------------------------------- +(0,0) | (1,0) | (2,0) | (3,0) | (4,0) | (5,0) | +----------------------------------------------- +(0,1) | (1,1) | (2,1) | (3,1) | (4,1) | (5,1) | +----------------------------------------------- +(0,2) | (1,2) | (2,2) | (3,2) | (4,2) | (5,2) | +----------------------------------------------- +(0,3) | (1,3) | (2,3) | (3,3) | (4,3) | (5,3) | +----------------------------------------------- +(0,4) | (1,4) | (2,4) | (3,4) | (4,4) | (5,4) | +--------------------------------------------- +(0,5) | (1,5) | (2,5) | (3,5) | (4,5) | (5,5) | +----------------------------------------------- +(0,6) | (1,6) | (2,6) | (3,6) | (4,6) | (5,6) | +----------------------------------------------- + +It would appear in memory we are working on 1 row at a time + +A RowMajorBlockSwizzle<2, OneDirection> swizzles things a little bit more +----------------------------------------------- +(0,0) | (1,3) | (2,0) | (3,3) | (4,0) | (5,3) | +----------------------------------------------- +(1,0) | (0,4) | (3,0) | (2,4) | (5,0) | (4,4) | +----------------------------------------------- +(0,1) | (1,4) | (2,1) | (3,4) | (4,1) | (5,4) | +----------------------------------------------- +(1,1) | (0,5) | (3,1) | (2,5) | (5,1) | (4,5) | +----------------------------------------------- +(0,2) | (1,5) | (2,2) | (3,5) | (4,2) | (5,5) |
+--------------------------------------------- +(1,2) | (0,6) | (3,2) | (2,6) | (5,2) | (4,6) | +----------------------------------------------- +(0,3) | (1,6) | (2,3) | (3,6) | (4,3) | (5,6) | +----------------------------------------------- + +so in memory, it would appear that we work on 2 rows at a time rather than 1 row +Note that the index here really represents how each block maps to memory + +A RowMajorBlockSwizzle<1, Boustrophedon> is similar to RowMajorBlockSwizzle<1, OneDirection> +except that every column flips the ordering against the previous one + +----------------------------------------------- +(0,0) | (1,6) | (2,0) | (3,6) | (4,0) | (5,6) | +----------------------------------------------- +(0,1) | (1,5) | (2,1) | (3,5) | (4,1) | (5,5) | +----------------------------------------------- +(0,2) | (1,4) | (2,2) | (3,4) | (4,2) | (5,4) | +----------------------------------------------- +(0,3) | (1,3) | (2,3) | (3,3) | (4,3) | (5,3) | +----------------------------------------------- +(0,4) | (1,2) | (2,4) | (3,2) | (4,4) | (5,2) | +--------------------------------------------- +(0,5) | (1,1) | (2,5) | (3,1) | (4,5) | (5,1) | +----------------------------------------------- +(0,6) | (1,0) | (2,6) | (3,0) | (4,6) | (5,0) | +----------------------------------------------- + +similarly, A RowMajorBlockSwizzle<2, Boustrophedon> looks like +----------------------------------------------- +(0,0) | (1,3) | (2,3) | (3,6) | (4,0) | (5,3) | +----------------------------------------------- +(1,0) | (0,4) | (3,2) | (2,6) | (5,0) | (4,4) | +----------------------------------------------- +(0,1) | (1,4) | (2,2) | (3,5) | (4,1) | (5,4) | +----------------------------------------------- +(1,1) | (0,5) | (3,1) | (2,5) | (5,1) | (4,5) | +----------------------------------------------- +(0,2) | (1,5) | (2,1) | (3,4) | (4,2) | (5,5) | +--------------------------------------------- +(1,2) | (0,6) | (3,0) | (2,4) | (5,2) | (4,6) |
+----------------------------------------------- +(0,3) | (1,6) | (2,0) | (3,3) | (4,3) | (5,6) | +----------------------------------------------- + +*/ + +template +struct RowMajorBlockSwizzle { + /// Ctor. + CUTLASS_HOST_DEVICE RowMajorBlockSwizzle() {} + + /// Swizzle the block index. + CUTLASS_DEVICE dim3 swizzle() { + assert(gridDim.z == 1); + int linearIdx = getLinearIdx(groupRows); + dim3 swizzledBlockIdx; + int currGroupRows = groupRows; + int prevGroupRows = groupRows; + + if ((gridDim.y % groupRows != 0) && ((blockIdx.y + (gridDim.y % groupRows)) >= gridDim.y)) { + // last columns + currGroupRows = gridDim.y % groupRows; + } + + swizzledBlockIdx.x = + linearIdx % currGroupRows + prevGroupRows * (linearIdx / (prevGroupRows * gridDim.x)); + swizzledBlockIdx.y = (linearIdx / currGroupRows) % gridDim.x; + swizzledBlockIdx.z = blockIdx.z; + + return swizzledBlockIdx; + } + + /// + CUTLASS_HOST_DEVICE dim3 get_grid_layout(GemmCoord const &problem_size, + Coord<3> const &OutputTile) { + dim3 grid; + grid.x = (problem_size.n() + OutputTile[1] - 1) / OutputTile[1]; + grid.y = (problem_size.m() + OutputTile[2] - 1) / OutputTile[2]; + grid.z = problem_size.batch(); + return grid; + } + + /// + CUTLASS_DEVICE Coord<3> get_threadblock_offset(Coord<3> const &OutputTile) { + dim3 block = swizzle(); + Coord<3> threadblock_offset = + make_Coord(0, block.y * OutputTile[1], block.x * OutputTile[2]); + return threadblock_offset; + } + + /// + CUTLASS_DEVICE int get_batch_id() { + dim3 block = swizzle(); + return block.z; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace gemm +} // namespace cutlass diff --git a/cutlass/gemm/wmma_gemm_epilogue_traits.h b/cutlass/gemm/wmma_gemm_epilogue_traits.h index 0fafacf90e..f35264dda1 100644 --- a/cutlass/gemm/wmma_gemm_epilogue_traits.h +++ b/cutlass/gemm/wmma_gemm_epilogue_traits.h @@ -27,18 +27,18 @@ */ #pragma once -#include +#include 
"cutlass/wmma_matrix.h" #ifdef CUTLASS_USE_WMMA_API -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "cutlass/convert.h" +#include "cutlass/coord.h" +#include "cutlass/gemm/gemm_global_stream.h" +#include "cutlass/gemm/gemm_shared_stream.h" +#include "cutlass/gemm/linear_scaling.h" +#include "cutlass/gemm/wmma_gemm_global_tile.h" +#include "cutlass/gemm/wmma_gemm_shared_tile.h" +#include "cutlass/reshape_tile.h" +#include "cutlass/tile_iterator.h" namespace cutlass { namespace gemm { @@ -89,7 +89,7 @@ struct WmmaGemmEpilogueTraitsHelper { MemorySpace::kShared, Index_, WmmaMatrix, - IteratorFragment::kWmmaMatrix> + FragmentElementType::kWmmaMatrix> SharedStoreIteratorD; /// The shared store transformer for D. @@ -114,6 +114,9 @@ struct WmmaGemmEpilogueTraitsHelper { MemorySpace::kShared> SharedLoadIteratorD; + /// The stream to load D. + typedef SharedLoadStream SharedLoadStreamD; + /// The traits class to build the iterator to load data from global memory for C^N. typedef WmmaGemmGlobalIteratorCdTraits< // The pointer is float const. diff --git a/cutlass/gemm/wmma_gemm_global_tile.h b/cutlass/gemm/wmma_gemm_global_tile.h index dbd57f6b5b..ce369d0ebb 100644 --- a/cutlass/gemm/wmma_gemm_global_tile.h +++ b/cutlass/gemm/wmma_gemm_global_tile.h @@ -27,7 +27,7 @@ */ #pragma once -#include +#include "cutlass/gemm/gemm_global_tile.h" namespace cutlass { namespace gemm { @@ -68,22 +68,13 @@ struct WmmaGemmGlobalIteratorCdTraits : public GemmGlobalTileTraits -struct WmmaGemmGlobalIteratorCd : public TileIteratorBase { +struct WmmaGemmGlobalIteratorCd : public GemmGlobalIteratorCd { /// This class. typedef WmmaGemmGlobalIteratorCd This_; /// The traits. typedef TileTraits_ Traits; /// The base class. - typedef TileIteratorBase - Base; + typedef GemmGlobalIteratorCd Base; /// Override the strides in each dimension between different loads/stores. 
typedef Shape<0, 0, Base::Delta::kW, Base::Delta::kC> ImmediateOffsetStrides; /// The layout. @@ -99,47 +90,36 @@ struct WmmaGemmGlobalIteratorCd : public TileIteratorBasepointer = pointer; + BaseParams::pointer = pointer; + // Stride between GEMMs + BaseParams::stride_d = batch_stride; // Setup the base stride. One "group of threads" per column. - stride_h = ld; + BaseParams::stride_h = ldm; // Each thread output 1 column per iteration. . - inc_h = ld * TileTraits_::Threads::kH; - inc_advance = inc_h + epilogue_stride_w; + BaseParams::inc_h = ldm * TileTraits_::Threads::kH; + BaseParams::inc_advance = BaseParams::inc_h + epilogue_stride_w; - predicate_offset = n; - predicate_inc_h = TileTraits_::Threads::kH; - predicate_inc_advance = predicate_inc_h + epilogue_delta_w; + BaseParams::predicate_offset = n; + BaseParams::predicate_inc_h = TileTraits_::Threads::kH; + BaseParams::predicate_inc_advance = BaseParams::predicate_inc_h + epilogue_delta_w; - // It worked. return 0; } }; - Params params; - - Coord<4> thread_offset; - - /// Ctor. - CUTLASS_DEVICE WmmaGemmGlobalIteratorCd() {} - /// Ctor. CUTLASS_DEVICE WmmaGemmGlobalIteratorCd(Params const& params, const Coord<3>& bounds, @@ -148,61 +128,37 @@ struct WmmaGemmGlobalIteratorCd : public TileIteratorBaseparams.pointer += ((h * params.stride_h + w) + pointer_offset); - - // Prepare the vector of predicates. - for (int i = 0; i < Base::Iterations::kW; ++i) { - predicates.set(i, w + i * Base::Delta::kW < bounds[2]); - } - this->params.predicate_offset -= (h + pred_offset); - } - - /// The accessor. - CUTLASS_DEVICE void get(typename Base::AccessType& value, int d, int h, int w, int c) const { - int const imm = - ComputeOffsetFromStrides::get(0, 0, w, c); - Load::load(value, params.pointer, imm); - } + : Base(params, bounds, block, pointer_offset, pred_offset, thread_offset_func) {} - /// Increment the pointer in the C dimension. - CUTLASS_DEVICE void inc_c() {} - /// Increment the pointer in the W dimension. 
- CUTLASS_DEVICE void inc_w() {} - /// Increment the pointer in the H dimension. - CUTLASS_DEVICE void inc_h() { - params.pointer += params.inc_h; - params.predicate_offset -= params.predicate_inc_h; - } - /// Increment the pointer in the D dimension. - CUTLASS_DEVICE void inc_d() {} - /// Increment the pointer to move to the next iteration. - CUTLASS_DEVICE void inc_advance() { - params.pointer += params.inc_advance; - params.predicate_offset -= params.predicate_inc_advance; + /// Loads a single fragment element from memory + CUTLASS_DEVICE void load_element( + typename Base::AccessType& value, int d, int h, int w, int c) const { + Base::load_element(value, d, h, w, c); } - /// The accessor. - CUTLASS_DEVICE void set(typename Base::AccessType const& value, int d, int h, int w, int c) { - int const imm = + /// Stores a single fragment element into memory + CUTLASS_DEVICE void store_element( + typename Base::AccessType const& value, int d, int h, int w, int c) { + int const offset = ComputeOffsetFromStrides::get(d, h, w, 0); - Store::store( - value, params.pointer, imm); + Store::store(value, Base::params.pointer, offset); } - /// Test the predicate. - CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const { - return predicates.at(w) && params.predicate_offset > 0; + public: + template + CUTLASS_DEVICE void load_post_increment(Fragment& fragment) { + Base::load_post_increment(fragment); } - /// The predicates for the row. 
- cutlass::PredicateVector predicates; + template + CUTLASS_DEVICE void store_post_increment(Fragment& fragment) { + Base::store_post_increment(fragment); + } }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cutlass/gemm/wmma_gemm_multiply_add.h b/cutlass/gemm/wmma_gemm_multiply_add.h index 5968350e05..328e43adbd 100644 --- a/cutlass/gemm/wmma_gemm_multiply_add.h +++ b/cutlass/gemm/wmma_gemm_multiply_add.h @@ -27,9 +27,9 @@ */ #pragma once -#include +#include "cutlass/wmma_matrix.h" #ifdef CUTLASS_USE_WMMA_API -#include +#include "cutlass/fragment.h" namespace cutlass { namespace gemm { @@ -42,15 +42,17 @@ template struct WmmaGemmMultiplyAdd { /// The shape of the instruction. typedef InstructionShape_ InstructionShape; /// The number of threads per warp. That's a dummy configuration. typedef Shape<1, InstructionShape_::kH, InstructionShape_::kW> ThreadsPerWarp; - /// The dimensions. - typedef AccumulatorsPerWarp_ AccumulatorsPerWarp; + /// Dimensions of the warp-level GEMM (K-by-N-by-M) + typedef WarpGemmShape_ WarpGemmShape; + /// Aliased for compatibility. Will be removed in CUTLASS v2.0 + typedef WarpGemmShape_ AccumulatorsPerWarp; /// The type for A. typedef ScalarA_ ScalarA; /// The type for B. @@ -102,6 +104,251 @@ struct WmmaGemmMultiplyAdd { //////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Specialization for WMMA GEMM with binary operands +template +struct WmmaGemmMultiplyAdd , + MatrixLayout::kColumnMajor, + Vector, + MatrixLayout::kColumnMajor, + int, + WarpGemmShape_, + Shape<128, 8, 8> >{ + /// The shape of the instruction. + typedef Shape<128, 8, 8> InstructionShape; + /// The number of threads per warp. That's a dummy configuration. 
+ typedef Shape<1, 4, 8> ThreadsPerWarp; + /// Dimensions of the warp-level GEMM (K-by-N-by-M) + typedef WarpGemmShape_ WarpGemmShape; + /// Aliased for compatibility. Will be removed in CUTLASS v2.0 + typedef WarpGemmShape_ AccumulatorsPerWarp; + /// The type for A. + typedef Vector ScalarA; + /// The type for B. + typedef Vector ScalarB; + /// The type for C and D. + typedef int ScalarC; + /// The number of iterations. + typedef typename ShapeDiv::Shape Iterations; + + /// The element for A. + typedef WmmaMatrix, + InstructionShape> ElementA; + /// The fragment for A. + typedef Fragment FragmentA; + + /// The element for B. + typedef WmmaMatrix, + InstructionShape> ElementB; + /// The fragment for B. + typedef Fragment FragmentB; + + /// The element for C. + typedef WmmaMatrix ElementC; + /// The fragment for C. + typedef Fragment Accumulators; + + /// Ctor. + CUTLASS_DEVICE WmmaGemmMultiplyAdd() {} + + /// Multiply : d = a*b. + CUTLASS_DEVICE void multiply_add(FragmentA const& a, + FragmentB const& b, + Accumulators const& c, + Accumulators& d) { + for (int j = 0; j < Iterations::kH; ++j) { + for (int i = 0; i < Iterations::kW; ++i) { + // The input elements. + ElementA const& elt_a = a[i]; + ElementB const& elt_b = b[j]; + ElementC const& elt_c = c[j * Iterations::kW + i]; + + // The output element. + ElementC& elt_d = d[j * Iterations::kW + i]; + + // The wmma instruction. + nvcuda::wmma::bmma_sync(elt_d, + elt_a, + elt_b, + elt_c, + nvcuda::wmma::experimental::bmmaBitOpXOR, + nvcuda::wmma::experimental::bmmaAccumulateOpPOPC); + } + } + } +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Specialization for WMMA GEMM with signed 4-bit integer operands +template +struct WmmaGemmMultiplyAdd , + MatrixLayout::kColumnMajor, + Vector, + MatrixLayout::kColumnMajor, + int, + WarpGemmShape_, + Shape<32, 8, 8> >{ + /// The shape of the instruction. 
+ typedef Shape<32, 8, 8> InstructionShape; + /// The number of threads per warp. That's a dummy configuration. + typedef Shape<1, 4, 8> ThreadsPerWarp; + /// Dimensions of the warp-level GEMM (K-by-N-by-M) + typedef WarpGemmShape_ WarpGemmShape; + /// Aliased for compatibility. Will be removed in CUTLASS v2.0 + typedef WarpGemmShape_ AccumulatorsPerWarp; + /// The type for A. + typedef Vector ScalarA; + /// The type for B. + typedef Vector ScalarB; + /// The type for C and D. + typedef int ScalarC; + /// The number of iterations. + typedef typename ShapeDiv::Shape Iterations; + + /// The element for A. + typedef WmmaMatrix, + InstructionShape> ElementA; + /// The fragment for A. + typedef Fragment FragmentA; + + /// The element for B. + typedef WmmaMatrix, + InstructionShape> ElementB; + /// The fragment for B. + typedef Fragment FragmentB; + + /// The element for C. + typedef WmmaMatrix ElementC; + /// The fragment for C. + typedef Fragment Accumulators; + + /// Ctor. + CUTLASS_DEVICE WmmaGemmMultiplyAdd() {} + + /// Multiply : d = a*b. + CUTLASS_DEVICE void multiply_add(FragmentA const& a, + FragmentB const& b, + Accumulators const& c, + Accumulators& d) { + for (int j = 0; j < Iterations::kH; ++j) { + for (int i = 0; i < Iterations::kW; ++i) { + // The input elements. + ElementA const& elt_a = a[i]; + ElementB const& elt_b = b[j]; + ElementC const& elt_c = c[j * Iterations::kW + i]; + + // The output element. + ElementC& elt_d = d[j * Iterations::kW + i]; + + // The wmma instruction. + nvcuda::wmma::mma_sync(elt_d, elt_a, elt_b, elt_c); + } + } + } +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Specialization for WMMA GEMM with unsigned 4-bit integer operands +template +struct WmmaGemmMultiplyAdd , + MatrixLayout::kColumnMajor, + Vector, + MatrixLayout::kColumnMajor, + int, + WarpGemmShape_, + Shape<32, 8, 8> >{ + /// The shape of the instruction. 
+ typedef Shape<32, 8, 8> InstructionShape; + /// The number of threads per warp. That's a dummy configuration. + typedef Shape<1, 4, 8> ThreadsPerWarp; + /// Dimensions of the warp-level GEMM (K-by-N-by-M) + typedef WarpGemmShape_ WarpGemmShape; + /// Aliased for compatibility. Will be removed in CUTLASS v2.0 + typedef WarpGemmShape_ AccumulatorsPerWarp; + /// The type for A. + typedef Vector ScalarA; + /// The type for B. + typedef Vector ScalarB; + /// The type for C and D. + typedef int ScalarC; + /// The number of iterations. + typedef typename ShapeDiv::Shape Iterations; + + /// The element for A. + typedef WmmaMatrix, + InstructionShape> ElementA; + /// The fragment for A. + typedef Fragment FragmentA; + + /// The element for B. + typedef WmmaMatrix, + InstructionShape> ElementB; + /// The fragment for B. + typedef Fragment FragmentB; + + /// The element for C. + typedef WmmaMatrix ElementC; + /// The fragment for C. + typedef Fragment Accumulators; + + /// Ctor. + CUTLASS_DEVICE WmmaGemmMultiplyAdd() {} + + /// Multiply : d = a*b. + CUTLASS_DEVICE void multiply_add(FragmentA const& a, + FragmentB const& b, + Accumulators const& c, + Accumulators& d) { + for (int j = 0; j < Iterations::kH; ++j) { + for (int i = 0; i < Iterations::kW; ++i) { + // The input elements. + ElementA const& elt_a = a[i]; + ElementB const& elt_b = b[j]; + ElementC const& elt_c = c[j * Iterations::kW + i]; + + // The output element. + ElementC& elt_d = d[j * Iterations::kW + i]; + + // The wmma instruction. 
+ nvcuda::wmma::mma_sync(elt_d, elt_a, elt_b, elt_c); + } + } + } +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace gemm } // namespace cutlass diff --git a/cutlass/gemm/wmma_gemm_shared_tile.h b/cutlass/gemm/wmma_gemm_shared_tile.h index 7d15b260fa..1a90e2f107 100644 --- a/cutlass/gemm/wmma_gemm_shared_tile.h +++ b/cutlass/gemm/wmma_gemm_shared_tile.h @@ -28,18 +28,15 @@ */ #pragma once -#include +#include "cutlass/wmma_matrix.h" #ifdef CUTLASS_USE_WMMA_API -#include -#include +#include "cutlass/gemm/gemm_operand.h" +#include "cutlass/reshape_tile.h" namespace cutlass { namespace gemm { -template -struct Debug {}; - //////////////////////////////////////////////////////////////////////////////////////////////////// template +#include "cutlass/wmma_matrix.h" #ifdef CUTLASS_USE_WMMA_API -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "cutlass/convert.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/gemm_epilogue.h" +#include "cutlass/gemm/gemm_epilogue_traits.h" +#include "cutlass/gemm/gemm_global_tile.h" +#include "cutlass/gemm/gemm_shared_tile.h" +#include "cutlass/gemm/gemm_traits.h" +#include "cutlass/gemm/wmma_gemm_epilogue_traits.h" +#include "cutlass/gemm/wmma_gemm_global_tile.h" +#include "cutlass/gemm/wmma_gemm_multiply_add.h" namespace cutlass { namespace gemm { @@ -53,12 +53,16 @@ template < MatrixLayout::Kind kLayoutB_, /// The tile size for the GEMM KxNxM. typename OutputTile_, + /// The input type. + typename ScalarA_, + /// The input type. + typename ScalarB_, /// The output type. typename ScalarC_, /// The accumulator type. typename Accumulator_, - /// The number of accumulators per warp. - typename AccumulatorsPerWarp_, + /// Tile size for warp-level GEMM (K-by-N-by-M) + typename WarpGemmShape_, /// The shape of the WMMA instruction. 
typename InstructionShape_, /// The number of scalars per LDG for A. @@ -67,9 +71,9 @@ template < int kScalarsPerLdgB_> struct WmmaGemmConfig : public GemmConfig< /// The scalar type for A. - half, + ScalarA_, /// The scalar type for B. - half, + ScalarB_, /// The scalar type for C. ScalarC_, /// The scalar type for D. @@ -78,12 +82,12 @@ struct WmmaGemmConfig : public GemmConfig< OutputTile_, /// The functor to do the math in the main loop. WmmaGemmMultiplyAdd, /// The number of scalars per LDG for A. kScalarsPerLdgA_, @@ -100,21 +104,29 @@ struct WmmaGemmConfig : public GemmConfig< /// The number of scalars per LDG for C and STG for D. 16 / sizeof(ScalarC_), /// The number of scalars per STS for D. - 16 / sizeof(ScalarC_), + 16 / sizeof(Accumulator_), /// The number of scalars per LDS for D. - 16 / sizeof(ScalarC_), + 16 / sizeof(Accumulator_), /// The number of stages in shared memory. - 1> {}; + 1, + /// If true, residue is computed in mainloop. If false, separate loops are instantiated. + false, + /// Is residue performed in prologue? + true, + /// If true, kernel is launched with CUDA launch bounds specified + false> {}; //////////////////////////////////////////////////////////////////////////////////////////////////// -template +template struct WmmaGemmTileTraitsHelperA {}; //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct WmmaGemmTileTraitsHelperA +template +struct WmmaGemmTileTraitsHelperA : public GemmTileTraitsHelperA { /// The base config. typedef GemmTileTraitsHelperA Base; @@ -173,8 +185,8 @@ struct WmmaGemmTileTraitsHelperA //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct WmmaGemmTileTraitsHelperA { +template +struct WmmaGemmTileTraitsHelperA { /// The layout. 
static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor; @@ -251,13 +263,276 @@ struct WmmaGemmTileTraitsHelperA { //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct WmmaGemmTileTraitsHelperB {}; +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Specialization for WMMA GEMM with binary operands +template +struct WmmaGemmTileTraitsHelperA > { + /// The layout. + static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor; + + /// The input scalar. + typedef typename GemmConfig_::ScalarA Scalar; + /// The scalar stored in shared memory. + typedef typename GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar; + + /// GemmConfig_::OutputTile::kD is in number of 'bits'. TileTraits expects number of 'Scalar'. + /// Divide by 'kBitsPerScalar' to get the number in 'Scalar'. + static int const kBitsPerScalar = sizeof(Scalar) * 8; + + /// WMMA matrix + typedef WmmaMatrix, + typename GemmConfig_::InstructionShape> + WmmaMatrix; + + /// The traits class to build the iterator to load data from global memory for A^T. + typedef GemmGlobalTileTraits< + // That's A. + GemmOperand::kA, + // A is row-major. + MatrixLayout::kRowMajor, + // The pointer is float const. + Scalar const, + // The tile has size KxM in GEMM's terminology. + Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD / kBitsPerScalar>, + // The threads are distributed as warps x 32 (the traits may reorganize). + Shape<1, + GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kBitsPerScalar), + GemmConfig_::OutputTile::kD / kBitsPerScalar>, + // The number of scalars per LDG (LDG.32 or LDG.128, etc). + GemmConfig_::kScalarsPerLdgA / kBitsPerScalar> + GlobalTileTraits; + + /// The skew. + static int const kSkew = 16 / sizeof(MultiplyAddScalar); + /// The tile. + typedef Shape + Tile; + + /// The traits class to build the iterator to store data to shared memory for A^N. + typedef GemmSharedStoreTileAbTraits< + // The pointer. 
+ MultiplyAddScalar, + // The tile has size KxM in GEMM's terminology. + Tile, + // The threads are distributed as warps x 32 (the traits may reorganize). + typename GlobalTileTraits::Threads, + // The number of scalars per STS (STS.32 or STS.128, etc). + GemmConfig_::kScalarsPerStsA / kBitsPerScalar> + SharedStoreTileTraits; + + /// The number of elements loaded in one LDG. + static int const kScalarsPerW = GemmConfig_::InstructionShape::kW * GemmConfig_::Warps::kW; + /// The traits class to build the iterator to load from shared memory for A. + typedef WmmaGemmSharedLoadTileATraits< + // The layout of the matrix. + MatrixLayout::kRowMajor, + // The pointer. + MultiplyAddScalar, + // The tile in shared memory. + Tile, + // The number of warps. + typename GemmConfig_::Warps, + // The strides between warps. + GemmConfig_::InstructionShape::kW * Tile::kW, + // The number of iterations to load the data. + Shape<1, 1, GemmConfig_::OutputTile::kW / kScalarsPerW>, + // The stride between iterations. + Shape, + // The shape of the instruction. + typename GemmConfig_::InstructionShape> + SharedLoadTileTraits; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Specialization for WMMA GEMM with unsigned 4-bit integer operands +template +struct WmmaGemmTileTraitsHelperA > { + /// The layout. + static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor; + + /// The input scalar. + typedef typename GemmConfig_::ScalarA Scalar; + /// The scalar stored in shared memory. + typedef typename GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar; + + /// GemmConfig_::OutputTile::kD is in number of 'int4'. TileTraits expects number of 'Scalar'. + /// Divide by 'kInt4PerScalar' to get the number in 'Scalar'. 
+ static int const kInt4PerScalar = sizeof(Scalar) * 2; + + /// WMMA matrix + typedef WmmaMatrix, + typename GemmConfig_::InstructionShape> + WmmaMatrix; + + /// The traits class to build the iterator to load data from global memory for A^T. + typedef GemmGlobalTileTraits< + // That's A. + GemmOperand::kA, + // A is row-major. + MatrixLayout::kRowMajor, + // The pointer is float const. + Scalar const, + // The tile has size KxM in GEMM's terminology. + Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD / kInt4PerScalar>, + // The threads are distributed as warps x 32 (the traits may reorganize). + Shape<1, + GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kInt4PerScalar), + GemmConfig_::OutputTile::kD / kInt4PerScalar>, + // The number of scalars per LDG (LDG.32 or LDG.128, etc). + GemmConfig_::kScalarsPerLdgA / kInt4PerScalar> + GlobalTileTraits; + + /// The skew. + static int const kSkew = 16 / sizeof(MultiplyAddScalar); + /// The tile. + typedef Shape + Tile; + + /// The traits class to build the iterator to store data to shared memory for A^N. + typedef GemmSharedStoreTileAbTraits< + // The pointer. + MultiplyAddScalar, + // The tile has size KxM in GEMM's terminology. + Tile, + // The threads are distributed as warps x 32 (the traits may reorganize). + typename GlobalTileTraits::Threads, + // The number of scalars per STS (STS.32 or STS.128, etc). + GemmConfig_::kScalarsPerStsA / kInt4PerScalar> + SharedStoreTileTraits; + + /// The number of elements loaded in one LDG. + static int const kScalarsPerW = GemmConfig_::InstructionShape::kW * GemmConfig_::Warps::kW; + /// The traits class to build the iterator to load from shared memory for A. + typedef WmmaGemmSharedLoadTileATraits< + // The layout of the matrix. + MatrixLayout::kRowMajor, + // The pointer. + MultiplyAddScalar, + // The tile in shared memory. + Tile, + // The number of warps. + typename GemmConfig_::Warps, + // The strides between warps. 
+ GemmConfig_::InstructionShape::kW * Tile::kW, + // The number of iterations to load the data. + Shape<1, 1, GemmConfig_::OutputTile::kW / kScalarsPerW>, + // The stride between iterations. + Shape, + // The shape of the instruction. + typename GemmConfig_::InstructionShape> + SharedLoadTileTraits; +}; +#endif //////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Specialization for WMMA GEMM with signed 4-bit integer operands template -struct WmmaGemmTileTraitsHelperB +struct WmmaGemmTileTraitsHelperA > { + /// The layout. + static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor; + + /// The input scalar. + typedef typename GemmConfig_::ScalarA Scalar; + /// The scalar stored in shared memory. + typedef typename GemmConfig_::MultiplyAdd::ScalarA MultiplyAddScalar; + + /// GemmConfig_::OutputTile::kD is in number of 'int4'. TileTraits expects number of 'Scalar'. + /// Divide by 'kInt4PerScalar' to get the number in 'Scalar'. + static int const kInt4PerScalar = sizeof(Scalar) * 2; + + /// WMMA matrix + typedef WmmaMatrix, + typename GemmConfig_::InstructionShape> + WmmaMatrix; + + /// The traits class to build the iterator to load data from global memory for A^T. + typedef GemmGlobalTileTraits< + // That's A. + GemmOperand::kA, + // A is row-major. + MatrixLayout::kRowMajor, + // The pointer is float const. + Scalar const, + // The tile has size KxM in GEMM's terminology. + Shape<1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD / kInt4PerScalar>, + // The threads are distributed as warps x 32 (the traits may reorganize). + Shape<1, + GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kInt4PerScalar), + GemmConfig_::OutputTile::kD / kInt4PerScalar>, + // The number of scalars per LDG (LDG.32 or LDG.128, etc). + GemmConfig_::kScalarsPerLdgA / kInt4PerScalar> + GlobalTileTraits; + + /// The skew. 
+ static int const kSkew = 16 / sizeof(MultiplyAddScalar); + /// The tile. + typedef Shape + Tile; + + /// The traits class to build the iterator to store data to shared memory for A^N. + typedef GemmSharedStoreTileAbTraits< + // The pointer. + MultiplyAddScalar, + // The tile has size KxM in GEMM's terminology. + Tile, + // The threads are distributed as warps x 32 (the traits may reorganize). + typename GlobalTileTraits::Threads, + // The number of scalars per STS (STS.32 or STS.128, etc). + GemmConfig_::kScalarsPerStsA / kInt4PerScalar> + SharedStoreTileTraits; + + /// The number of elements loaded in one LDG. + static int const kScalarsPerW = GemmConfig_::InstructionShape::kW * GemmConfig_::Warps::kW; + /// The traits class to build the iterator to load from shared memory for A. + typedef WmmaGemmSharedLoadTileATraits< + // The layout of the matrix. + MatrixLayout::kRowMajor, + // The pointer. + MultiplyAddScalar, + // The tile in shared memory. + Tile, + // The number of warps. + typename GemmConfig_::Warps, + // The strides between warps. + GemmConfig_::InstructionShape::kW * Tile::kW, + // The number of iterations to load the data. + Shape<1, 1, GemmConfig_::OutputTile::kW / kScalarsPerW>, + // The stride between iterations. + Shape, + // The shape of the instruction. + typename GemmConfig_::InstructionShape> + SharedLoadTileTraits; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct WmmaGemmTileTraitsHelperB {}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct WmmaGemmTileTraitsHelperB : public GemmTileTraitsHelperB { /// The base config. 
typedef GemmTileTraitsHelperB Base; @@ -316,8 +591,8 @@ struct WmmaGemmTileTraitsHelperB //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct WmmaGemmTileTraitsHelperB { +template +struct WmmaGemmTileTraitsHelperB { /// The layout. static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor; @@ -394,6 +669,267 @@ struct WmmaGemmTileTraitsHelperB { //////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Specialization for WMMA GEMM with binary operands +template +struct WmmaGemmTileTraitsHelperB > { + /// The layout. + static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor; + + /// The input scalar. + typedef typename GemmConfig_::ScalarB Scalar; + /// The scalar stored in shared memory. + typedef typename GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar; + + /// GemmConfig_::OutputTile::kD is in number of 'bits'. TileTraits expects number of 'Scalar'. + /// Divide by 'kBitsPerScalar' to get the number in 'Scalar'. + static int const kBitsPerScalar = sizeof(Scalar) * 8; + + /// WMMA matrix + typedef WmmaMatrix, + typename GemmConfig_::InstructionShape> + WmmaMatrix; + + /// The traits class to build the iterator to load data from global memory for B^N. + typedef GemmGlobalTileTraits< + // That's B. + GemmOperand::kB, + // A is row-major. + MatrixLayout::kColumnMajor, + // The pointer is float const. + Scalar const, + // The tile has size KxM in GEMM's terminology. + Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD / kBitsPerScalar>, + // The threads are distributed as warps x 32 (the traits may reorganize). + Shape<1, + GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kBitsPerScalar), + GemmConfig_::OutputTile::kD / kBitsPerScalar>, + // The number of scalars per LDG (LDG.32 or LDG.128, etc). 
+ GemmConfig_::kScalarsPerLdgB / kBitsPerScalar> + GlobalTileTraits; + + /// The skew. + static int const kSkew = 16 / sizeof(MultiplyAddScalar); + /// The tile. + typedef Shape + Tile; + + /// The traits class to build the iterator to store data to shared memory for B^N. + typedef GemmSharedStoreTileAbTraits< + // The pointer. + MultiplyAddScalar, + // The tile has size KxM in GEMM's terminology. + Tile, + // The threads are distributed as warps x 32 (the traits may reorganize). + typename GlobalTileTraits::Threads, + // The number of scalars per STS (STS.32 or STS.128, etc). + GemmConfig_::kScalarsPerStsB / kBitsPerScalar> + SharedStoreTileTraits; + + /// The number of elements loaded in one LDG. + static int const kScalarsPerW = GemmConfig_::InstructionShape::kH * GemmConfig_::Warps::kH; + /// The traits class to build the iterator to load from shared memory for B. + typedef WmmaGemmSharedLoadTileBTraits< + // The layout of the matrix. + MatrixLayout::kColumnMajor, + // The pointer. + MultiplyAddScalar, + // The tile in shared memory. + Tile, + // The number of warps. + typename GemmConfig_::Warps, + // The strides between warps. + GemmConfig_::InstructionShape::kH * Tile::kW, + // The number of iterations to load the data. + Shape<1, 1, GemmConfig_::OutputTile::kH / kScalarsPerW>, + // The stride between iterations. + Shape, + // The shape of the instruction. + typename GemmConfig_::InstructionShape> + SharedLoadTileTraits; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Specialization for WMMA GEMM with unsigned 4-bit integer operands +template +struct WmmaGemmTileTraitsHelperB > { + /// The layout. + static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor; + + /// The input scalar. + typedef typename GemmConfig_::ScalarB Scalar; + /// The scalar stored in shared memory. 
+ typedef typename GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar; + + /// GemmConfig_::OutputTile::kD is in number of 'int4'. TileTraits expects number of 'Scalar'. + /// Divide by 'kInt4PerScalar' to get the number in 'Scalar'. + static int const kInt4PerScalar = sizeof(Scalar) * 2; + + /// WMMA matrix + typedef WmmaMatrix, + typename GemmConfig_::InstructionShape> + WmmaMatrix; + + /// The traits class to build the iterator to load data from global memory for B^N. + typedef GemmGlobalTileTraits< + // That's B. + GemmOperand::kB, + // A is row-major. + MatrixLayout::kColumnMajor, + // The pointer is float const. + Scalar const, + // The tile has size KxM in GEMM's terminology. + Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD / kInt4PerScalar>, + // The threads are distributed as warps x 32 (the traits may reorganize). + Shape<1, + GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kInt4PerScalar), + GemmConfig_::OutputTile::kD / kInt4PerScalar>, + // The number of scalars per LDG (LDG.32 or LDG.128, etc). + GemmConfig_::kScalarsPerLdgB / kInt4PerScalar> + GlobalTileTraits; + + /// The skew. + static int const kSkew = 16 / sizeof(MultiplyAddScalar); + /// The tile. + typedef Shape + Tile; + + /// The traits class to build the iterator to store data to shared memory for B^N. + typedef GemmSharedStoreTileAbTraits< + // The pointer. + MultiplyAddScalar, + // The tile has size KxM in GEMM's terminology. + Tile, + // The threads are distributed as warps x 32 (the traits may reorganize). + typename GlobalTileTraits::Threads, + // The number of scalars per STS (STS.32 or STS.128, etc). + GemmConfig_::kScalarsPerStsB / kInt4PerScalar> + SharedStoreTileTraits; + + /// The number of elements loaded in one LDG. + static int const kScalarsPerW = GemmConfig_::InstructionShape::kH * GemmConfig_::Warps::kH; + /// The traits class to build the iterator to load from shared memory for B. 
+ typedef WmmaGemmSharedLoadTileBTraits< + // The layout of the matrix. + MatrixLayout::kColumnMajor, + // The pointer. + MultiplyAddScalar, + // The tile in shared memory. + Tile, + // The number of warps. + typename GemmConfig_::Warps, + // The strides between warps. + GemmConfig_::InstructionShape::kH * Tile::kW, + // The number of iterations to load the data. + Shape<1, 1, GemmConfig_::OutputTile::kH / kScalarsPerW>, + // The stride between iterations. + Shape, + // The shape of the instruction. + typename GemmConfig_::InstructionShape> + SharedLoadTileTraits; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Specialization for WMMA GEMM with signed 4-bit integer operands +template +struct WmmaGemmTileTraitsHelperB > { + /// The layout. + static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor; + + /// The input scalar. + typedef typename GemmConfig_::ScalarB Scalar; + /// The scalar stored in shared memory. + typedef typename GemmConfig_::MultiplyAdd::ScalarB MultiplyAddScalar; + + /// GemmConfig_::OutputTile::kD is in number of 'int4'. TileTraits expects number of 'Scalar'. + /// Divide by 'kInt4PerScalar' to get the number in 'Scalar'. + static int const kInt4PerScalar = sizeof(Scalar) * 2; + + /// WMMA matrix + typedef WmmaMatrix, + typename GemmConfig_::InstructionShape> + WmmaMatrix; + + /// The traits class to build the iterator to load data from global memory for B^N. + typedef GemmGlobalTileTraits< + // That's B. + GemmOperand::kB, + // A is row-major. + MatrixLayout::kColumnMajor, + // The pointer is float const. + Scalar const, + // The tile has size KxM in GEMM's terminology. + Shape<1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD / kInt4PerScalar>, + // The threads are distributed as warps x 32 (the traits may reorganize). 
+ Shape<1, + GemmConfig_::kThreads / (GemmConfig_::OutputTile::kD / kInt4PerScalar), + GemmConfig_::OutputTile::kD / kInt4PerScalar>, + // The number of scalars per LDG (LDG.32 or LDG.128, etc). + GemmConfig_::kScalarsPerLdgB / kInt4PerScalar> + GlobalTileTraits; + + /// The skew. + static int const kSkew = 16 / sizeof(MultiplyAddScalar); + /// The tile. + typedef Shape + Tile; + + /// The traits class to build the iterator to store data to shared memory for B^N. + typedef GemmSharedStoreTileAbTraits< + // The pointer. + MultiplyAddScalar, + // The tile has size KxM in GEMM's terminology. + Tile, + // The threads are distributed as warps x 32 (the traits may reorganize). + typename GlobalTileTraits::Threads, + // The number of scalars per STS (STS.32 or STS.128, etc). + GemmConfig_::kScalarsPerStsB / kInt4PerScalar> + SharedStoreTileTraits; + + /// The number of elements loaded in one LDG. + static int const kScalarsPerW = GemmConfig_::InstructionShape::kH * GemmConfig_::Warps::kH; + /// The traits class to build the iterator to load from shared memory for B. + typedef WmmaGemmSharedLoadTileBTraits< + // The layout of the matrix. + MatrixLayout::kColumnMajor, + // The pointer. + MultiplyAddScalar, + // The tile in shared memory. + Tile, + // The number of warps. + typename GemmConfig_::Warps, + // The strides between warps. + GemmConfig_::InstructionShape::kH * Tile::kW, + // The number of iterations to load the data. + Shape<1, 1, GemmConfig_::OutputTile::kH / kScalarsPerW>, + // The stride between iterations. + Shape, + // The shape of the instruction. + typename GemmConfig_::InstructionShape> + SharedLoadTileTraits; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + template < /// The layout for A. MatrixLayout::Kind kLayoutA_, @@ -401,14 +937,18 @@ template < MatrixLayout::Kind kLayoutB_, /// The output tile. typename OutputTile_, + /// The input type. + typename ScalarA_, + /// The input type. 
+ typename ScalarB_, /// The output type. typename ScalarC_, /// The accumulator type. typename Accumulator_, /// The functor to do the math in the epilogue. typename EpilogueFunctor_, - /// The number of accumulators per warp. - typename AccumulatorsPerWarp_, + /// Tile size for warp-level GEMM (K-by-N-by-M) + typename WarpGemmShape_, /// The shape of the WMMA instruction. typename InstructionShape_, /// The number of halfs loaded in one LDG for A. @@ -422,18 +962,20 @@ struct WmmaGemmTraitsHelper { typedef WmmaGemmConfig GemmConfig; /// The GEMM config for A. - typedef WmmaGemmTileTraitsHelperA GemmTileTraitsHelperA; + typedef WmmaGemmTileTraitsHelperA GemmTileTraitsHelperA; /// The GEMM config for B. - typedef WmmaGemmTileTraitsHelperB GemmTileTraitsHelperB; + typedef WmmaGemmTileTraitsHelperB GemmTileTraitsHelperB; /// The iterator to load A from global memory. typedef GemmGlobalIteratorAb @@ -447,7 +989,10 @@ struct WmmaGemmTraitsHelper { MemorySpace::kShared> SharedStoreIteratorA; /// The stream to load A from global memory to shared memory. - typedef GlobalLoadStream + typedef GlobalLoadStream GlobalLoadStreamA; /// The iterator to load B from global memory. @@ -462,7 +1007,10 @@ struct WmmaGemmTraitsHelper { MemorySpace::kShared> SharedStoreIteratorB; /// The stream to load B from global memory to shared memory. - typedef GlobalLoadStream + typedef GlobalLoadStream GlobalLoadStreamB; /// The iterator to load A from shared memory. @@ -472,7 +1020,7 @@ struct WmmaGemmTraitsHelper { MemorySpace::kShared, Index_, typename GemmTileTraitsHelperA::WmmaMatrix, - IteratorFragment::kWmmaMatrix> + FragmentElementType::kWmmaMatrix> SharedLoadIteratorA; /// The stream to load A from shared memory. 
typedef SharedLoadStream SharedLoadStreamA; @@ -483,7 +1031,7 @@ struct WmmaGemmTraitsHelper { MemorySpace::kShared, Index_, typename GemmTileTraitsHelperB::WmmaMatrix, - IteratorFragment::kWmmaMatrix> + FragmentElementType::kWmmaMatrix> SharedLoadIteratorB; /// The stream to load B from shared memory. typedef SharedLoadStream SharedLoadStreamB; @@ -518,14 +1066,18 @@ template < MatrixLayout::Kind kLayoutB_, /// The tile size for the GEMM KxNxM. typename OutputTile_ = Shape<64, 128, 128>, + /// The input type. + typename ScalarA_ = half, + /// The input type. + typename ScalarB_ = half, /// The output type. typename ScalarC_ = float, /// The functor to do the math in the epilogue. typename EpilogueFunctor_ = LinearScaling, /// The accumulator type. typename Accumulator_ = ScalarC_, - /// The number of accumulators per warp. - typename AccumulatorsPerWarp_ = typename WmmaGemmAccumulatorsPerWarp::Shape, + /// Tile size for warp-level GEMM (K-by-N-by-M) + typename WarpGemmShape_ = typename WmmaGemmAccumulatorsPerWarp::Shape, /// The shape of the WMMA instruction. typename InstructionShape_ = Shape<16, 16, 16>, /// The number of scalars per LDG for A. 
@@ -538,10 +1090,12 @@ template < typename Helper_ = WmmaGemmTraitsHelper -#include -#include -#include +#include "cutlass/load_store.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/shape.h" namespace cutlass { /////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Loads a fragment from an input iterator +// Used by convolution template CUTLASS_HOST_DEVICE void iterator_load(InputIterator &iterator, Fragment &fragment) { typename InputIterator::FragmentIterator frag_iterator(fragment); @@ -45,12 +43,12 @@ CUTLASS_HOST_DEVICE void iterator_load(InputIterator &iterator, Fragment &fragme for (int w = 0; w < InputIterator::Iterations::kW; ++w) { for (int c = 0; c < InputIterator::Iterations::kC; ++c) { if (iterator.valid(d, h, w, c)) { - iterator.get(reinterpret_cast( - frag_iterator.at(d, h, w, c)), - d, - h, - w, - c); + iterator.load_element(reinterpret_cast( + frag_iterator.at(d, h, w, c)), + d, + h, + w, + c); } } if (w < InputIterator::Iterations::kW - 1) { @@ -68,138 +66,21 @@ CUTLASS_HOST_DEVICE void iterator_load(InputIterator &iterator, Fragment &fragme iterator.inc_advance(); } -/// Loads a fragment from a shared memory input iterator -template -CUTLASS_DEVICE void shared_iterator_load(InputIterator &iterator, Fragment &fragment) { - typename InputIterator::FragmentIterator frag_iterator(fragment); - for (int d = 0; d < InputIterator::Iterations::kD; ++d) { - for (int h = 0; h < InputIterator::Iterations::kH; ++h) { - for (int w = 0; w < InputIterator::Iterations::kW; ++w) { - for (int c = 0; c < InputIterator::Iterations::kC; ++c) { - int const offset = - ComputeOffsetFromStrides::get( - d, h, w, c); - - FragmentLoad::load(frag_iterator.at(d, h, w, c), - iterator.data(), - offset); - } - } - } - } -} - -/// Loads a fragment from a shared memory input iterator -template -CUTLASS_DEVICE void shared_iterator_load(InputIterator &iterator, Fragment &fragment, int d) { - typename 
InputIterator::FragmentIterator frag_iterator(fragment); - for (int h = 0; h < InputIterator::Iterations::kH; ++h) { - for (int w = 0; w < InputIterator::Iterations::kW; ++w) { - for (int c = 0; c < InputIterator::Iterations::kC; ++c) { - int const offset = - ComputeOffsetFromStrides::get( - d, h, w, c); - - FragmentLoad::load(frag_iterator.at(0, h, w, c), - iterator.data(), - offset); - } - } - } -} - -/// Loads a fragment from an input iterator, masked by a predicate iterator -template -CUTLASS_HOST_DEVICE void iterator_load_post_increment(InputIterator &iterator, - Fragment &fragment, - typename InputIterator::Index offset, - ConstPredicateAdapter predicate_adapter) { - for (int d = 0; d < InputIterator::Iterations::kD; ++d, iterator.inc_d()) { - for (int h = 0; h < InputIterator::Iterations::kH; ++h, iterator.inc_h()) { - for (int w = 0; w < InputIterator::Iterations::kW; ++w, iterator.inc_w()) { - if (predicate_adapter.at(d, h, w, 0)) { - int idx = InputIterator::Tile::kC * - (w + InputIterator::Iterations::kW * (h + InputIterator::Iterations::kH * d)); - - Load:: - load(reinterpret_cast(fragment[idx]), - iterator.data(), - offset); - } - } - } - } -} - -/// Loads a fragment from an input iterator -template -CUTLASS_HOST_DEVICE void iterator_load_post_increment(InputIterator &iterator, - Fragment &fragment, - typename InputIterator::Index offset = 0) { - TrivialPredicateTileAdapter pred; - iterator_load_post_increment(iterator, fragment, offset, pred); -} - -/// Loads a fragment from an input iterator -template -CUTLASS_HOST_DEVICE void iterator_load_post_increment(InputIterator &iterator, - Fragment &fragment, - ConstPredicateAdapter pred_it) { - iterator_load_post_increment(iterator, fragment, 0, pred_it); -} - -template -CUTLASS_HOST_DEVICE void iterator_load(InputIterator const &_iterator, - Fragment &fragment, - typename InputIterator::Index offset, - ConstPredicateAdapter predicate_adapter) { - InputIterator iterator(_iterator); - 
iterator_load_post_increment(iterator, fragment, offset, predicate_adapter); -} - -/// Loads a fragment from an input iterator -template -CUTLASS_HOST_DEVICE void iterator_load(InputIterator const &iterator, - Fragment &fragment, - typename InputIterator::Index offset = 0) { - TrivialPredicateTileAdapter pred; - iterator_load(iterator, fragment, offset, pred); -} - -/// Loads a fragment from an input iterator -template -CUTLASS_HOST_DEVICE void iterator_load(InputIterator const &iterator, - Fragment &fragment, - ConstPredicateAdapter pred_it) { - iterator_load(iterator, fragment, 0, pred_it); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - -/// Stores a fragment to an output iterator template CUTLASS_HOST_DEVICE void iterator_store(OutputIterator &iterator, Fragment &fragment) { typename OutputIterator::FragmentIterator frag_iterator(fragment); for (int d = 0; d < OutputIterator::Iterations::kD; ++d) { for (int h = 0; h < OutputIterator::Iterations::kH; ++h) { for (int w = 0; w < OutputIterator::Iterations::kW; ++w) { - if (iterator.valid(d, h, w, 0)) { - iterator.set(reinterpret_cast( - frag_iterator.at(d, h, w, 0)), - d, - h, - w, - 0); + for (int c = 0; c < OutputIterator::Iterations::kC; ++c) { + if (iterator.valid(d, h, w, c)) { + iterator.store_element(reinterpret_cast( + frag_iterator.at(d, h, w, c)), + d, + h, + w, + c); + } } if (w < OutputIterator::Iterations::kW - 1) { iterator.inc_w(); @@ -215,104 +96,6 @@ CUTLASS_HOST_DEVICE void iterator_store(OutputIterator &iterator, Fragment &frag } iterator.inc_advance(); } - -/// Stores a fragment to a shared memory output iterator -template -CUTLASS_DEVICE void shared_iterator_store(OutputIterator &iterator, Fragment const &fragment) { - typename OutputIterator::FragmentConstIterator frag_iterator(fragment); - for (int d = 0; d < OutputIterator::Iterations::kD; ++d) { - for (int h = 0; h < OutputIterator::Iterations::kH; ++h) { - for (int w = 0; w < 
OutputIterator::Iterations::kW; ++w) { - for (int c = 0; c < OutputIterator::Iterations::kC; ++c) { - int const offset = - ComputeOffsetFromStrides::get( - d, h, w, c); - - FragmentStore::store(frag_iterator.at(d, h, w, c), - iterator.data(), - offset); - } - } - } - } -} - //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Stores a fragment to an output iterator, masked by a predicate iterator -template -CUTLASS_HOST_DEVICE void iterator_store_post_increment(OutputIterator &iterator, - Fragment const &fragment, - typename OutputIterator::Index offset, - ConstPredicateAdapter predicate_adapter) { - for (int d = 0; d < OutputIterator::Iterations::kD; ++d, iterator.inc_d()) { - for (int h = 0; h < OutputIterator::Iterations::kH; ++h, iterator.inc_h()) { - for (int w = 0; w < OutputIterator::Iterations::kW; ++w, iterator.inc_w()) { - if (predicate_adapter.at(d, h, w, 0)) { - int idx = OutputIterator::Tile::kC * - (w + OutputIterator::Iterations::kW * (h + OutputIterator::Iterations::kH * d)); - - Store:: - store(reinterpret_cast(fragment[idx]), - iterator.data(), - offset); - } - } - } - } -} - -/// Stores a fragment to an output iterator -template -CUTLASS_HOST_DEVICE void iterator_store_post_increment(OutputIterator &iterator, - Fragment const &fragment, - typename OutputIterator::Index offset = 0) { - TrivialPredicateTileAdapter pred; - iterator_store_post_increment(iterator, fragment, offset, pred); -} - -/// Stores a fragment to an output iterator -template -CUTLASS_HOST_DEVICE void iterator_store_post_increment(OutputIterator &iterator, - Fragment const &fragment, - ConstPredicateAdapter pred_it) { - iterator_store_post_increment(iterator, fragment, 0, pred_it); -} - -/// Stores a fragment to an output iterator, masked by a predicate iterator -template -CUTLASS_HOST_DEVICE void iterator_store(OutputIterator const &_iterator, - Fragment const &fragment, - typename OutputIterator::Index offset, - 
ConstPredicateAdapter predicate_adapter) { - OutputIterator iterator(_iterator); - iterator_store_post_increment(iterator, fragment, offset, predicate_adapter); -} - -/// Stores a fragment to an output iterator -template -CUTLASS_HOST_DEVICE void iterator_store(OutputIterator const &iterator, - Fragment const &fragment, - typename OutputIterator::Index offset = 0) { - TrivialPredicateTileAdapter pred; - iterator_store(iterator, fragment, offset, pred); -} - -/// Stores a fragment to an output iterator -template -CUTLASS_HOST_DEVICE void iterator_store(OutputIterator const &iterator, - Fragment const &fragment, - ConstPredicateAdapter pred_it) { - iterator_store(iterator, fragment, 0, pred_it); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace cutlass diff --git a/cutlass/kernel_launch.h b/cutlass/kernel_launch.h new file mode 100644 index 0000000000..ee37b2fda9 --- /dev/null +++ b/cutlass/kernel_launch.h @@ -0,0 +1,67 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines structures and helpers to launch CUDA kernels within CUTLASS. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure containing the basic launch configuration of a CUDA kernel. 
+struct KernelLaunchConfiguration { + + /// CUDA grid dimensions + dim3 grid; + + /// CUDA threadblock dimensions + dim3 block; + + /// Bytes of dynamically allocated SMEM in addition to static SMEM + size_t dynamic_smem; + + // + // Methods + // + + /// Constructs a KernelLaunchConfiguration object + CUTLASS_HOST_DEVICE + KernelLaunchConfiguration( + dim3 _grid = dim3(1,1,1), + dim3 _block = dim3(1,1,1), + size_t _dynamic_smem = 0 + ): + grid(_grid), + block(_block), + dynamic_smem(_dynamic_smem) { } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/cutlass/load_store.h b/cutlass/load_store.h index 5cb5eb6728..db09dd0a48 100644 --- a/cutlass/load_store.h +++ b/cutlass/load_store.h @@ -27,8 +27,7 @@ */ #pragma once -#include - +#include "cutlass/vector.h" namespace cutlass { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -44,45 +43,68 @@ struct MemorySpace { }; }; +/// Specifies whether iterator storage fragment consists of Scalar values or WMMA matrix +struct FragmentElementType { + enum Kind { kScalar, kWmmaMatrix }; +}; + //////////////////////////////////////////////////////////////////////////////////////////////////// template 1), - size_t = (sizeof(Scalar_) * Lanes_)> + FragmentElementType::Kind kFragmentElementType = FragmentElementType::kScalar, + typename FragmentElement_ = Scalar_, + int kStride = 1, + size_t size = (sizeof(Scalar_) * kAccessSize)> struct Load { /// The output type. - typedef typename Vectorize::Type AccessType; + typedef typename Vectorize::Type AccessType; /// The load function.
- static CUTLASS_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) { - dst = reinterpret_cast(&pointer[offset])[0]; + static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) { + dst = *reinterpret_cast(pointer + offset); } + }; //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct Load { +/// Partial specialization for 16b loads +template +struct Load { /// The output type. - typedef typename Vectorize::Type AccessType; + typedef typename Vectorize::Type AccessType; - /// The store function. - static CUTLASS_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) { + /// The load function. + static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) { + reinterpret_cast(dst) = reinterpret_cast(&pointer[offset])[0]; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Load { + /// The output type. + typedef typename Vectorize::Type AccessType; + + /// The load function. + static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) { dst.registers[0] = reinterpret_cast(&pointer[offset])[0]; } + }; //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct Load { +template +struct Load { /// The output type. - typedef typename Vectorize::Type AccessType; + typedef typename Vectorize::Type AccessType; - /// The store function. - static CUTLASS_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) { + /// The load function. 
+ static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) { uint2 tmp = reinterpret_cast(&pointer[offset])[0]; dst.registers[0] = tmp.x; dst.registers[1] = tmp.y; @@ -91,13 +113,13 @@ struct Load { //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct Load { +template +struct Load { /// The output type. typedef typename Vectorize::Type AccessType; - /// The store function. - static CUTLASS_DEVICE void load(AccessType& dst, double const* pointer, int offset) { + /// The load function. + static CUTLASS_HOST_DEVICE void load(AccessType& dst, double const* pointer, int offset) { double2 tmp = reinterpret_cast(&pointer[offset])[0]; dst[0] = tmp.x; dst[1] = tmp.y; @@ -108,13 +130,13 @@ struct Load { #if defined(__CUDACC_VERSION_MAJOR) && __CUDACC_VERSION_MAJOR < 10 // WAR bug in NVCC where the upper and lower half of the register end up being the same -template -struct Load { +template +struct Load { /// The output type. typedef typename Vectorize::Type AccessType; - /// The store function. - static CUTLASS_DEVICE void load(AccessType& dst, half const* pointer, int offset) { + /// The load function. + static CUTLASS_HOST_DEVICE void load(AccessType& dst, half const* pointer, int offset) { int2 tmp = reinterpret_cast(&pointer[offset])[0]; dst.registers[0] = tmp.x; dst.registers[1] = tmp.y; @@ -129,13 +151,13 @@ struct Load { //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct Load { +template +struct Load { /// The output type. - typedef typename Vectorize::Type AccessType; + typedef typename Vectorize::Type AccessType; - /// The store function. - static CUTLASS_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) { + /// The load function. 
+ static CUTLASS_HOST_DEVICE void load(AccessType& dst, Scalar_ const* pointer, int offset) { uint4 tmp = reinterpret_cast(&pointer[offset])[0]; dst.registers[0] = tmp.x; dst.registers[1] = tmp.y; @@ -147,29 +169,45 @@ struct Load { //////////////////////////////////////////////////////////////////////////////////////////////////// template 1), - size_t = (sizeof(Scalar_) * Lanes_)> + FragmentElementType::Kind kFragmentElementType = FragmentElementType::kScalar, + typename FragmentElement_ = Scalar_, + int kStride = 1, + size_t size = (sizeof(Scalar_) * kAccessSize)> struct Store { /// The output type. - typedef typename Vectorize::Type AccessType; + typedef typename Vectorize::Type AccessType; /// The store function. - static CUTLASS_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) { - pointer[offset] = src; + static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) { + pointer[offset] = *reinterpret_cast(&src); } }; //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct Store { +template +struct Store { /// The output type. - typedef typename Vectorize::Type AccessType; + typedef typename Vectorize::Type AccessType; /// The store function. - static CUTLASS_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) { + static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) { + uint16_t* addr = reinterpret_cast(&pointer[offset]); + addr[0] = reinterpret_cast(src); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Store { + /// The output type. + typedef typename Vectorize::Type AccessType; + + /// The store function. 
+ static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) { uint32_t* addr = reinterpret_cast(&pointer[offset]); addr[0] = src.registers[0]; } @@ -177,13 +215,13 @@ struct Store { //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct Store { +template +struct Store { /// The output type. - typedef typename Vectorize::Type AccessType; + typedef typename Vectorize::Type AccessType; /// The store function. - static CUTLASS_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) { + static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) { uint2* addr = reinterpret_cast(&pointer[offset]); addr[0] = make_uint2(src.registers[0], src.registers[1]); } @@ -191,13 +229,13 @@ struct Store { //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct Store { +template +struct Store { /// The output type. typedef typename Vectorize::Type AccessType; /// The store function. - static CUTLASS_DEVICE void store(AccessType const& src, double* pointer, int offset) { + static CUTLASS_HOST_DEVICE void store(AccessType const& src, double* pointer, int offset) { double2* addr = reinterpret_cast(&pointer[offset]); addr[0] = make_double2(src[0], src[1]); } @@ -205,13 +243,13 @@ struct Store { //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct Store { +template +struct Store { /// The output type. - typedef typename Vectorize::Type AccessType; + typedef typename Vectorize::Type AccessType; /// The store function. 
- static CUTLASS_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) { + static CUTLASS_HOST_DEVICE void store(AccessType const& src, Scalar_* pointer, int offset) { uint4* addr = reinterpret_cast(&pointer[offset]); addr[0] = make_uint4(src.registers[0], src.registers[1], src.registers[2], src.registers[3]); } @@ -219,4 +257,123 @@ struct Store { //////////////////////////////////////////////////////////////////////////////////////////////////// +template +struct Load { + /// The output type. + typedef FragmentElement_ AccessType; + + /// The load function. + static CUTLASS_HOST_DEVICE void load(AccessType& value, Scalar_ const* pointer, int offset) { + value.load(&pointer[offset], kStride); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Load, + kAccessSize, + Memory_, + FragmentElementType::kWmmaMatrix, + FragmentElement_, + kStride, + size> { + /// The output type. + typedef FragmentElement_ AccessType; + + /// The load function. + static CUTLASS_HOST_DEVICE void load(AccessType& value, Vector const* pointer, + int offset) { + value.load(&pointer[offset], kStride * 32); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Load, + kAccessSize, + Memory_, + FragmentElementType::kWmmaMatrix, + FragmentElement_, + kStride, + size> { + /// The output type. + typedef FragmentElement_ AccessType; + + /// The load function. + static CUTLASS_HOST_DEVICE void load(AccessType& value, Vector const* pointer, + int offset) { + value.load(&pointer[offset], kStride * 8); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Load, + kAccessSize, + Memory_, + FragmentElementType::kWmmaMatrix, + FragmentElement_, + kStride, + size> { + /// The output type. + typedef FragmentElement_ AccessType; + + /// The load function. 
+ static CUTLASS_HOST_DEVICE void load(AccessType& value, Vector const* pointer, + int offset) { + value.load(&pointer[offset], kStride * 8); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +template +struct Store { + /// The input type. + typedef FragmentElement_ AccessType; + + /// The store function. + static CUTLASS_HOST_DEVICE void store(AccessType const& value, Scalar_* pointer, int offset) { + value.store(&pointer[offset], kStride); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace cutlass diff --git a/cutlass/matrix_traits.h b/cutlass/matrix_traits.h index 77e8b70625..08a43a99af 100644 --- a/cutlass/matrix_traits.h +++ b/cutlass/matrix_traits.h @@ -27,13 +27,327 @@ */ #pragma once +#include "cutlass/coord.h" + namespace cutlass { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Describes layouts of matrices +/// MatrixCoord wraps Coord<2, int> to provide a helper for accessing named dimensions. Classes +/// expecting a coordinate in the rank=2 index space of a matrix should use MatrixCoord. 
+struct MatrixCoord : public Coord<2, int> { + + /// Integer-valued index + typedef int Index; + + /// Base type is a Coord of rank=2 + typedef Coord<2, Index> Base; + + /// Rows dimension + static int const kRow = 0; + + /// Columns dimension + static int const kColumn = 1; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + MatrixCoord() { } + + /// Constructs from Coord<2> + CUTLASS_HOST_DEVICE + MatrixCoord(Coord<2, Index> const &coord): Base(coord) { } + + /// Helper to construct from a row and column + CUTLASS_HOST_DEVICE + MatrixCoord(Index row, Index column): Base(make_Coord(row, column)) { } + + /// Returns the row of the coordinate + CUTLASS_HOST_DEVICE + Index const & row() const { return this->at(kRow); } + + /// Returns the row of the coordinate + CUTLASS_HOST_DEVICE + Index & row() { return this->at(kRow); } + + /// Returns the column of the coordinate + CUTLASS_HOST_DEVICE + Index const & column() const { return this->at(kColumn); } + + /// Returns the column of the coordinate + CUTLASS_HOST_DEVICE + Index & column() { return this->at(kColumn); } + + // + // Coord operators + // + + /// Element-wise addition + CUTLASS_HOST_DEVICE + MatrixCoord operator+(Base const& b) const { + return MatrixCoord(Base::operator+(b)); + } + + /// Element-wise subtraction + CUTLASS_HOST_DEVICE + MatrixCoord operator-(Base const& b) const { + return MatrixCoord(Base::operator-(b)); + } + + /// Element-wise multiplication + CUTLASS_HOST_DEVICE + MatrixCoord operator*(Base const& b) const { + return MatrixCoord(Base::operator*(b)); + } + + /// Element-wise division + CUTLASS_HOST_DEVICE + MatrixCoord operator/(Base const& b) const { + return MatrixCoord(Base::operator/(b)); + } + + /// In-place addition + CUTLASS_HOST_DEVICE + MatrixCoord& operator+=(Base const& b) { + Base::operator+=(b); + return *this; + } + + /// In-place subtraction + CUTLASS_HOST_DEVICE + MatrixCoord& operator-=(Base const& b) { + Base::operator-=(b); + return *this; + } + + /// 
In-place multiplication + CUTLASS_HOST_DEVICE + MatrixCoord& operator*=(Base const& b) { + Base::operator*=(b); + return *this; + } + + /// In-place division + CUTLASS_HOST_DEVICE + MatrixCoord& operator/=(Base const& b) { + Base::operator/=(b); + return *this; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines data layouts of various matrix formats usable by TensorRef and other classes. +// +// The following define classes satisfying the TensorRefMapFunc concept. These must support the +// following operations, where func is an instance of type TensorRefMapFunc. +// +// Coord = func(Coord); +// +// Though not required to be usable by TensorRef, each of the following also define a helper +// function to map the "leading dimension" to an appropriate stride vector. Implementations +// following this convention should also implement the following static method: +// +// Coord stride = TensorRefMapFunc::stride(leading_dim); +// struct MatrixLayout { + + /// Enumeration defining fundamental contiguous layouts. enum Kind { kRowMajor, kColumnMajor }; + + // + // TensorRefMapFunc definitions for common layouts + // + + /// Mapping function for row-major matrices + struct RowMajor { + static int const kStorageRank = 2; + /// Maps (i, j) to (i, j) + CUTLASS_HOST_DEVICE + Coord operator()(MatrixCoord const &coord) const { + return coord; + } + }; + + /// Mapping function for column-major matrices + struct ColumnMajor { + static int const kStorageRank = 2; + /// Maps (i, j) to (j, i) + CUTLASS_HOST_DEVICE + Coord operator()(MatrixCoord const &coord) const { + return make_Coord(coord.column(), coord.row()); + } + }; + + /// Mapping function for interleaved matrices. Matrix is structured + /// as row-major arrangement of fixed-size columns. 
+ template + struct RowMajorInterleaved { + + /// Rank of storage n-D array + static int const kStorageRank = 3; + + /// Interleaving size + static int const kInterleave = Interleave; + + /// Maps (row, col) to (row, col, row) + CUTLASS_HOST_DEVICE + Coord operator()(MatrixCoord const &coord) const { + return make_Coord( + coord.row() / kInterleave, + coord.column(), + coord.row() % kInterleave + ); + } + + /// Helper to compute stride vector from leading dimension + CUTLASS_HOST_DEVICE + static Coord stride(int ldm) { + return make_Coord( + ldm * kInterleave, + kInterleave, + 1 + ); + } + }; + + /// Mapping function for interleaved matrices. Matrix is structured + /// as column-major arrangement of fixed-size rows. + template + struct ColumnMajorInterleaved { + + /// Rank of storage n-D array + static int const kStorageRank = 3; + + /// Interleaving size + static int const kInterleave = Interleave; + + /// Maps (row, col) to (col, row, col) + CUTLASS_HOST_DEVICE + Coord operator()(MatrixCoord const &coord) const { + return make_Coord( + coord.column() / kInterleave, + coord.row(), + coord.column() % kInterleave + ); + } + + /// Helper to compute stride vector from leading dimension + CUTLASS_HOST_DEVICE + static Coord stride(int ldm) { + return make_Coord( + ldm * kInterleave, + kInterleave, + 1 + ); + } + }; + + /// Mapping function for scenario in which layout is row-major or column-major but this information + /// is only available at runtime. + struct ContiguousLayout { + /// Arbitrary storage rank + static int const kStorageRank = 3; + + /// Dimension of rows + static int const kRow = 0; + + /// Dimension of columns + static int const kColumn = 1; + + /// Mapping function defined by runtime variable. 
Returns coordinates in n-D storage array + /// as (matrix row, matrix column, 0) + CUTLASS_HOST_DEVICE + Coord operator()(MatrixCoord const &coord) const { + return make_Coord(coord.row(), coord.column(), 0); + } + + /// Helper to construct a stride vector based on contiguous matrix layout and leading dimension + CUTLASS_HOST_DEVICE + static Coord stride(MatrixLayout::Kind layout, int ldm) { + if (layout == MatrixLayout::kRowMajor) { + return make_Coord(ldm, 1, 1); + } + return make_Coord(1, ldm, 1); + } + }; + + /// Mapping function for block-linear matrices. Matrix is structured + /// as column-major arrangement of 2D tiles (that are column-major). + template + struct ColumnMajorBlockLinear { + + /// Rank of storage n-D array + static int const kStorageRank = 4; + + /// Interleaving size in rows dimension + static int const kBlockRows = BlockRows; + + /// Interleaving size in columns dimension + static int const kBlockColumns = BlockColumns; + + /// Maps (row, col) to (col, row, col, row) + CUTLASS_HOST_DEVICE + Coord operator()(MatrixCoord const &coord) const { + return make_Coord( + coord.column() / kBlockColumns, + coord.row() / kBlockRows, + coord.column() % kBlockColumns, + coord.row() % kBlockRows + ); + } + + /// Helper to compute stride vector from leading dimension + CUTLASS_HOST_DEVICE + static Coord stride(int ldm) { + return make_Coord( + ldm * kBlockRows * kBlockColumns, + kBlockRows * kBlockColumns, + kBlockRows, + 1 + ); + } + }; + + /// Mapping function for block-linear matrices.
Matrix is structured + /// as row-major arrangement of 2D tiles (that are row-major) + template + struct RowMajorBlockLinear { + + /// Rank of storage n-D array + static int const kStorageRank = 4; + + /// Interleaving size in rows dimension + static int const kBlockRows = BlockRows; + + /// Interleaving size in columns dimension + static int const kBlockColumns = BlockColumns; + + /// Maps (row, col) to (row, col, row, col) + CUTLASS_HOST_DEVICE + Coord operator()(MatrixCoord const &coord) const { + return make_Coord( + coord.row() / kBlockRows, + coord.column() / kBlockColumns, + coord.row() % kBlockRows, + coord.column() % kBlockColumns + ); + } + + /// Helper to compute stride vector from leading dimension + CUTLASS_HOST_DEVICE + static Coord stride(int ldm) { + return make_Coord( + ldm * kBlockRows * kBlockColumns, + kBlockRows * kBlockColumns, + kBlockColumns, + 1 + ); + } + }; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -45,4 +359,14 @@ struct GemmOperand { //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Transformation applied to matrix operands +struct MatrixTransform { + enum Kind { + kNone, /// no operation + kConjugate, /// conjugate + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace cutlass diff --git a/cutlass/predicate_vector.h b/cutlass/predicate_vector.h index 81668577e2..4a37d017d7 100644 --- a/cutlass/predicate_vector.h +++ b/cutlass/predicate_vector.h @@ -28,12 +28,13 @@ */ #pragma once +#include #include -#include -#include +#include "cutlass/cutlass.h" +#include "cutlass/shape.h" -#include +#include "cutlass/util/platform.h" namespace cutlass { @@ -114,7 +115,7 @@ struct PredicateVector { // Make sure no one tries to put more than 8 bits in a byte :) static_assert(kPredicatesPerByte <= 8, "kPredicatesPerByte must fit within an actual byte"); // 
Make sure the "offsetted" bits fit in one byte. - static_assert(kPredicateStart + kPredicatesPerByte < 8, + static_assert(kPredicateStart + kPredicatesPerByte <= 8, "The offsetted predicates must fit within an actual byte."); /// Storage type of individual elements diff --git a/cutlass/reshape_tile.h b/cutlass/reshape_tile.h index 55aebfcafb..67faa602ac 100644 --- a/cutlass/reshape_tile.h +++ b/cutlass/reshape_tile.h @@ -27,7 +27,7 @@ */ #pragma once -#include +#include "cutlass/shape.h" namespace cutlass { diff --git a/cutlass/shape.h b/cutlass/shape.h index 4f6b222eec..b8c0c66f35 100644 --- a/cutlass/shape.h +++ b/cutlass/shape.h @@ -27,7 +27,7 @@ */ #pragma once -#include +#include "cutlass/cutlass.h" namespace cutlass { @@ -128,6 +128,17 @@ struct ShapeDiv { //////////////////////////////////////////////////////////////////////////////////////////////////// +template +struct ShapeDivCeiling { + typedef Shape<(A_::kD + B_::kD - 1) / B_::kD, + (A_::kH + B_::kH - 1) / B_::kH, + (A_::kW + B_::kW - 1) / B_::kW, + (A_::kC + B_::kC - 1) / B_::kC> + Shape; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + template struct ShapeMax { typedef Shape<(A_::kD > B_::kD ? 
A_::kD : B_::kD), @@ -150,12 +161,12 @@ struct ShapeMin { //////////////////////////////////////////////////////////////////////////////////////////////////// -template +template struct ShapeStrides { typedef Shape + elementsPerAccess> Shape; }; @@ -167,7 +178,7 @@ struct ShapeStrides { */ template struct ComputeOffsetFromShape { - static CUTLASS_DEVICE int get(int d, int h, int w, int c) { + static CUTLASS_HOST_DEVICE int get(int d, int h, int w, int c) { // clang-format off return d * Shape_::kH * Shape_::kW * Shape_::kC + h * Shape_::kW * Shape_::kC + @@ -179,73 +190,19 @@ struct ComputeOffsetFromShape { //////////////////////////////////////////////////////////////////////////////////////////////////// -/** -* @brief Compute the offset for the given coordinates in a cube with a depth of 1 -* @tparam kSh Elements in the H dimension -* @tparam kSw Elements in the W dimension -* @tparam kSc Separation between two elements in "elements" -*/ -template -struct ComputeOffsetFromShape > { - static CUTLASS_DEVICE int get(int d, int h, int w, int c) { - return h * kSw_ * kSc_ + w * kSc_ + c; - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/** -* @brief Compute the offset for the given coordinates in a cube with one channel and a depth of 1 -* @tparam kSh Elements in the H dimension -* @tparam kSw Elements in the W dimension -*/ -template -struct ComputeOffsetFromShape > { - static CUTLASS_DEVICE int get(int d, int h, int w, int c) { return h * kSw_ + w; } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - /** * @brief Compute the offset for the given coordinates in a cube * @tparam A \ref layout_concept where each dimension of the cube specifies the corresponding stride. 
*/ template struct ComputeOffsetFromStrides { - static CUTLASS_DEVICE int get(int d, int h, int w, int c) { + static CUTLASS_HOST_DEVICE int get(int d, int h, int w, int c) { return d * Strides_::kD + h * Strides_::kH + w * Strides_::kW + c * Strides_::kC; } }; //////////////////////////////////////////////////////////////////////////////////////////////////// -/** -* @brief Compute the offset for the given coordinates in a cube with a depth of 1 -* @tparam S_h Stride in the H dimension in scalars -* @tparam S_w Stride in the W dimension in scalars -* @tparam S_c Stride between two scalars. -*/ -template -struct ComputeOffsetFromStrides > { - static CUTLASS_DEVICE int get(int d, int h, int w, int c) { - return h * S_h_ + w * S_w_ + c * S_c_; - } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - -/** -* @brief Compute the offset for the given coordinates in a cube with one channel and a depth of 1 -* @tparam S_h Stride in the H dimension in scalars -* @tparam S_w Stride in the W dimension in scalars -*/ -template -struct ComputeOffsetFromStrides > { - static CUTLASS_DEVICE int get(int d, int h, int w, int c) { return h * S_h_ + w * S_w_; } -}; - -//////////////////////////////////////////////////////////////////////////////////////////////////// - /** * @brief Decompose threadId.x into coordinate of a cube whose dimensions are specified by Threads_. 
* Afterwards compute the offset of those coordinates using Strides_ diff --git a/cutlass/tensor_ref.h b/cutlass/tensor_ref.h index 8ef31e3b8f..09134190c0 100644 --- a/cutlass/tensor_ref.h +++ b/cutlass/tensor_ref.h @@ -27,125 +27,613 @@ */ #pragma once -#include - -#include -#include -#include +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/vector.h" namespace cutlass { -//////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Default mapping function from coordinates in a tensor's index space into the n-D array held +/// in memory. Assumes StorageRank = Rank +template +struct IdentityTensorMapFunc { + static int const kStorageRank = Rank; + CUTLASS_HOST_DEVICE + Coord operator()(Coord const &coord) const { + return coord; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/* \brief Structure modeling a pointer and stride into a tensor. + + A tensor consists of an index space with Rank_ dimensions. It is stored in memory modeled + as an n-D array, where n = StorageRank_. A mapping function maps the logical coordinates of the + tensor's index space into the n-D array, and a stride vector maps the n-D array to linear memory. + + CUTLASS requires the n-D array's least significant, "fastest changing" dimension to + be contiguous in memory. It therefore has a stride of 1 and is not stored. Construction is offered + from vectors of full StorageRank and of the 'compact' rank, though it is in error to construct + with the least significant stride != 1. + + The requirement that the least significant dimension be consecutive enables numerous optimizations + and assumptions about vectorizing memory accesses throughout CUTLASS. 
It also matches various + BLAS conventions in which only the "leading dimension" or most significant stride of a rank=2 + matrix is provided. + + This does affect the ability of constructing arbitrary "sparse" 2-D matrices in memory where all + stride elements are > 1. This can be overcome by defining a custom mapping function and a + StorageRank of 3 or more. + + + Examples: + + (These examples use helpers for matrix layouts defined in cutlass/matrix_traits.h) + + 1. Column-major matrix may be represented as a rank=2 tensor: + + TensorRef A(ptr_A, make_Coord(ldm, 1)); + + 2. Row-major matrix may be represented as a rank=2 tensor: + + TensorRef B(ptr_A, ldm); + + 3. An interleaved matrix may be represented as a rank=2 tensor: + + TensorRef > C; + + 4. Defining a sparse matrix with arbitrary strides in each dimension + + struct ContiguousLayout { + + /// Arbitrary storage rank + static int const kStorageRank = 3; + + /// Mapping function defined by runtime stride configuration + CUTLASS_HOST_DEVICE + Coord<3> operator()(MatrixCoord const &coord) const { + return make_Coord(coord.row(), coord.column(), 0); + } + }; + + typedef TensorRef ContiguousTensorRef; + + // Construct the TensorRef object from a pair of stride values + ContiguousTensorRef D(ptr_D, make_Coord(row_stride, column_stride)); + -/// Structure modeling a pointer and stride into a tensor -template + 5. A helper exists to define a TensorRef for a contiguous matrix whose layout + is not known at compile time. 
+ + MatrixLayout::Kind layout; // Could be MatrixLayout::kRowMajor or MatrixLayout::kColumnMajor + int ldm; // leading dimension + + ContiguousTensorRef E(ptr_E, ContiguousLayout::stride(layout, ldm)); + +*/ +template < + /// Data type of element stored within tensor + typename Storage_, + /// Rank of logical tensor + int Rank_, + /// Maps a Coord in the logical tensor index space to the internal n-D array + typename MapFunc_ = IdentityTensorMapFunc, + /// Rank of internal n-D array + int StorageRank_ = MapFunc_::kStorageRank, + /// Index type used for coordinates + typename Index_ = int, + /// Index type used for offsets and pointer differences + typename LongIndex_ = long long +> class TensorRef { public: /// Data type of individual access typedef Storage_ Storage; - /// Rank of tensor - static int const Rank = Rank_; + /// Logical rank of tensor index space + static int const kRank = Rank_; + + /// Mapping function from logical coordinate to internal n-D array + typedef MapFunc_ MapFunc; + + /// Rank of internal storage + static int const kStorageRank = StorageRank_; + + /// Index type + typedef Index_ Index; + + /// Typically, strides in memory can be very large + typedef LongIndex_ LongIndex; + + /// Coordinate in logical tensor space + typedef Coord TensorCoord; + + /// Coordinate in storage n-D array + typedef Coord StorageCoord; + + /// Stride vector in storage coordinage space - assumes least significant stride + /// is 1 and does not store it. + typedef Coord StrideVector; + + /// Tensor reference to of constant value + typedef TensorRef< + typename platform::remove_const::type const, + Rank_, + MapFunc_, + StorageRank_, + Index_, + LongIndex_> ConstTensorRef; + + /// Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a + /// scalar, but degenerate cases such as these are difficult to accommodate without + /// extensive C++ metaprogramming or support for zero-length arrays. 
+ static_assert(kRank > 0, "Cannot define a zero-rank TensorRef"); - private: // - // Data members + // Definitions included for backwards compatibility - to be removed in next major release // - /// Pointer to storage element + /// Coordinate in logical tensor space + typedef TensorCoord Coord_t; + + /// Logical rank of tensor index space + static int const Rank = kRank; + + private: + + /// Pointer Storage* ptr_; - /// Stride information - Coord stride_; + /// Stride vector - fastest-changing stride assumed to be 1 and not stored + StrideVector stride_; + + /// Maps a logical coordinate to an n-D array's tensor space + MapFunc coord_map_; public: + // // Methods // - /// Default ctor + /// Helper for 1-D memory. All higher ranks are projected onto the fastest changing rank. CUTLASS_HOST_DEVICE - TensorRef() : ptr_(nullptr) {} + TensorRef(Storage *ptr = nullptr): ptr_(ptr) { + for (int i = 0; i < kStorageRank - 1; ++i) { + stride_[i] = 1; + } + } - /// Constructs from a pointer, size, and stride + /// Helper to construct from a pointer and single stride element for 2-D pitch linear memory. + // Higher ranks are projected onto the fastest-changing rank. CUTLASS_HOST_DEVICE - TensorRef(Storage* ptr, Coord stride) : ptr_(ptr), stride_(stride) {} + TensorRef(Storage* ptr, Index ldm) { + ptr_ = ptr; + for (int i = 0; i < kStorageRank - 1; ++i) { + stride_[i] = ldm; + } + } - /// Updates the pointer, stride, and location within a TensorRef + /// Constructs from a single pointer and stride vector CUTLASS_HOST_DEVICE - void reset(Storage* ptr = nullptr, Coord stride = Coord(0)) { + TensorRef(Storage* ptr, StrideVector const& stride) : ptr_(ptr), stride_(stride) { + + } + + /// Constructs from a pointer and a stride vector of size kRank. If fastest changing + /// stride is not 1, construction fails and subsequent calls to good() will return false. 
+ CUTLASS_HOST_DEVICE + TensorRef(Storage* ptr, StorageCoord const& stride) { + // Fastest-changing stride must be one + if (stride.at(kStorageRank - 1) == 1) { + ptr_ = ptr; + for (int i = 0; i < kStorageRank - 1; ++i) { + stride_[i] = stride[i]; + } + } + else { + // Fastest-chaning stride must be 1. + reset(); + } + } + + /// Enables conversion from TensorRef of non-const type + CUTLASS_HOST_DEVICE + TensorRef( + TensorRef< + typename platform::remove_const::type, + kRank, + MapFunc, + kStorageRank, + Index, + LongIndex> const &ref + ): + ptr_(ref.data()) { + for (int i = 0; i < kStorageRank - 1; ++i) { + stride_[i] = ref.stride(i); + } + } + + /// Returns a reference to constant-valued tensor + CUTLASS_HOST_DEVICE + ConstTensorRef const_ref() const { + return ConstTensorRef(*this); + } + + /// Updates only the pointer + CUTLASS_HOST_DEVICE + void reset(Storage* ptr = nullptr) { ptr_ = ptr; - stride_ = stride; } - /// Conversion function - template - TensorRef convert() { - Coord converted_stride; - for (int i = 0; i < Rank - 1; ++i) { - converted_stride[i] = stride_[i] * Extent::kValue / Extent::kValue; + /// Updates the pointer, stride, and location within a TensorRef + CUTLASS_HOST_DEVICE + void reset(Storage* ptr, StorageCoord const & stride) { + // Fastest-changing stride must be one + if (stride.at(kStorageRank - 1) == 1) { + ptr_ = ptr; + for (int i = 0; i < kStorageRank - 1; ++i) { + stride_[i] = stride[i]; + } + } + else { + // Fastest-changing stride must be 1 - this is an error. 
+ reset(); + } + } + + /// Returns true if the TensorRef may be safely accessed + CUTLASS_HOST_DEVICE + bool good() const { + return ptr_ != nullptr; + } + + /// Returns the pointer to referenced data + CUTLASS_HOST_DEVICE + Storage * data() const { return ptr_; } + + /// Returns the stride of the tensor + CUTLASS_HOST_DEVICE + StorageCoord stride() const { + StorageCoord ld; + for (int i = 0; i < kStorageRank - 1; ++i) { + ld[i] = stride_[i]; + } + ld[kStorageRank - 1] = 1; + return ld; + } + + /// Returns the stride of the tensor in the given dimension + CUTLASS_HOST_DEVICE + Index stride(int dim) const { + // fastest-changing stride assumbed to be 1 + if (dim + 1 >= kStorageRank) { + return 1; + } + return stride_.at(dim); + } + + /// Returns the maximum stride element as the 'leading dimension' + CUTLASS_HOST_DEVICE + Index leading_dim(int idx = 0) const { return stride(idx); } + + /// Maps a logical coordinate to an n-D array in memory + CUTLASS_HOST_DEVICE + StorageCoord map(TensorCoord const &coord) const { + return coord_map_(coord); + } + + /// Computes the offset of an index from the origin of the tensor + CUTLASS_HOST_DEVICE + LongIndex offset(TensorCoord const& coord) const { + return stride().template dot(map(coord)); + } + + /// Returns a reference to the element at a given Coord + CUTLASS_HOST_DEVICE + Storage& at(TensorCoord const& coord) const { + return ptr_[offset(coord)]; + } + + /// Returns a reference to the element at a given linear index + CUTLASS_HOST_DEVICE + Storage& at(LongIndex idx) const { return ptr_[idx]; } + + /// Returns a reference to the element at a given Coord + CUTLASS_HOST_DEVICE + Storage& operator[](TensorCoord const& coord) const { + return ptr_[offset(coord)]; + } + + /// Returns a reference to the element at a given linear index + CUTLASS_HOST_DEVICE + Storage& operator[](LongIndex idx) const { return ptr_[idx]; } + + /// Adds an offset to each pointer + CUTLASS_HOST_DEVICE + TensorRef & add_pointer_offset(LongIndex 
delta) { + ptr_ += delta; + return *this; + } + + /// Returns a TensorRef offset by a given amount + CUTLASS_HOST_DEVICE + TensorRef operator+(TensorCoord const& b) const { + TensorRef result(*this); + result.add_pointer_offset(offset(b)); + return result; + } + + /// Returns a TensorRef offset by a given amount + CUTLASS_HOST_DEVICE + TensorRef& operator+=(TensorCoord const& b) { + add_pointer_offset(offset(b)); + return *this; + } + + /// Returns a TensorRef offset by a given amount + CUTLASS_HOST_DEVICE + TensorRef operator-(TensorCoord const& b) const { + TensorRef result(*this); + result.add_pointer_offset(-offset(b)); + return result; + } + + /// Returns a TensorRef offset by a given amount + CUTLASS_HOST_DEVICE + TensorRef& operator-=(TensorCoord const& b) { + add_pointer_offset(-offset(b)); + return *this; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Partial specializations to handle degenerate cases. +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Specialization for rank=1 case with no internal StrideVector +template < + /// Data type of element stored within tensor + typename Storage_, + /// Rank of logical tensor + int Rank_, + /// Maps a Coord in the logical tensor index space to the internal n-D array + typename MapFunc_, + /// Index type used for coordinates + typename Index_, + /// Index type used for offsets and pointer differences + typename LongIndex_ +> +class TensorRef { + public: + /// Data type of individual access + typedef Storage_ Storage; + + /// Logical rank of tensor index space + static int const kRank = Rank_; + + /// Mapping function from logical coordinate to internal n-D array + typedef MapFunc_ MapFunc; + + /// Rank of internal storage + static int const kStorageRank = 1; + + /// Index type + typedef Index_ Index; + + /// Typically, strides in memory can be very large + typedef LongIndex_ 
LongIndex; + + /// Coordinate in logical tensor space + typedef Coord TensorCoord; + + /// Coordinate in storage n-D array + typedef Coord StorageCoord; + + /// Stride vector in storage coordinage space - assumes least significant stride + /// is 1 and does not store it. + struct StrideVector { }; + + /// Tensor reference to of constant value + typedef TensorRef< + typename platform::remove_const::type const, + Rank_, + MapFunc_, + kStorageRank, + Index_, + LongIndex_> ConstTensorRef; + + // + // Definitions included for backwards compatibility - to be removed in next major release + // + + /// Coordinate in logical tensor space + typedef TensorCoord Coord_t; + + /// Logical rank of tensor index space + static int const Rank = kRank; + + private: + + /// Pointer + Storage* ptr_; + + /// Maps a logical coordinate to an n-D array's tensor space + MapFunc coord_map_; + + public: + + // + // Methods + // + + /// Helper for 1-D memory. All higher ranks are projected onto the fastest changing rank. + CUTLASS_HOST_DEVICE + TensorRef(Storage *ptr = nullptr): ptr_(ptr) { } + + /// Constructs from a single pointer and stride vector + CUTLASS_HOST_DEVICE + TensorRef(Storage* ptr, StrideVector const& stride) : ptr_(ptr) { + + } + + /// Constructs from a pointer and a stride vector of size kRank. If fastest changing + /// stride is not 1, construction fails and subsequent calls to good() will return false. + CUTLASS_HOST_DEVICE + TensorRef(Storage* ptr, StorageCoord const& stride) { + // Fastest-changing stride must be one + if (stride.at(kStorageRank - 1) == 1) { + ptr_ = ptr; + } + else { + // Fastest-chaning stride must be 1. 
+ reset(); } - converted_stride[Rank - 1] = stride_[Rank - 1]; + } - return TensorRef(reinterpret_cast(ptr_), converted_stride); + /// Enables conversion from TensorRef of non-const type + CUTLASS_HOST_DEVICE + TensorRef( + TensorRef< + typename platform::remove_const::type, + kRank, + MapFunc, + kStorageRank, + Index, + LongIndex> const &ref + ): + ptr_(ref.data()) { + } + + /// Returns a reference to constant-valued tensor + CUTLASS_HOST_DEVICE + ConstTensorRef const_ref() const { + return ConstTensorRef(*this); + } + + /// Updates only the pointer + CUTLASS_HOST_DEVICE + void reset(Storage* ptr = nullptr) { + ptr_ = ptr; + } + + /// Updates the pointer, stride, and location within a TensorRef + CUTLASS_HOST_DEVICE + void reset(Storage* ptr, StorageCoord const & stride) { + // Fastest-changing stride must be one + if (stride.at(kStorageRank - 1) == 1) { + ptr_ = ptr; + } + else { + // Fastest-changing stride must be 1 - this is an error. + reset(); + } } /// Returns true if the TensorRef may be safely accessed CUTLASS_HOST_DEVICE - bool good() const { return ptr_ != nullptr; } + bool good() const { + return ptr_ != nullptr; + } /// Returns the pointer to referenced data CUTLASS_HOST_DEVICE - Storage* data() const { return ptr_; } + Storage * data() const { return ptr_; } /// Returns the stride of the tensor CUTLASS_HOST_DEVICE - Coord const& stride() const { return stride_; } + StorageCoord stride() const { + StorageCoord ld; + ld[kStorageRank - 1] = 1; + return ld; + } /// Returns the stride of the tensor in the given dimension CUTLASS_HOST_DEVICE - int const& stride(int dim) const { return stride_.at(dim); } + Index stride(int dim) const { + // fastest-changing stride assumbed to be 1 + return 1; + } /// Returns the maximum stride element as the 'leading dimension' CUTLASS_HOST_DEVICE - int leading_dim() const { return __NV_STD_MAX(stride_[1], stride_[2]); } + Index leading_dim(int idx = 0) const { return 1; } + + /// Maps a logical coordinate to an n-D array 
in memory + CUTLASS_HOST_DEVICE + StorageCoord map(TensorCoord const &coord) const { + return coord_map_(coord); + } /// Computes the offset of an index from the origin of the tensor CUTLASS_HOST_DEVICE - long long offset(Coord const& coord) const { - return stride_.template dot(coord); + LongIndex offset(TensorCoord const& coord) const { + return stride().template dot(map(coord)); } /// Returns a reference to the element at a given Coord CUTLASS_HOST_DEVICE - Storage& at(Coord const& coord) const { return ptr_[offset(coord)]; } + Storage& at(TensorCoord const& coord) const { + return ptr_[offset(coord)]; + } - /// Element-wise accessor - Storage& operator[](Coord const& coord) const { return at(coord); } + /// Returns a reference to the element at a given linear index + CUTLASS_HOST_DEVICE + Storage& at(LongIndex idx) const { return ptr_[idx]; } /// Returns a reference to the element at a given Coord CUTLASS_HOST_DEVICE - Storage& at(int idx) const { return ptr_[idx]; } + Storage& operator[](TensorCoord const& coord) const { + return ptr_[offset(coord)]; + } + + /// Returns a reference to the element at a given linear index + CUTLASS_HOST_DEVICE + Storage& operator[](LongIndex idx) const { return ptr_[idx]; } - /// Element-wise accessor - Storage& operator[](int idx) const { return at(idx); } + /// Adds an offset to each pointer + CUTLASS_HOST_DEVICE + TensorRef & add_pointer_offset(LongIndex delta) { + ptr_ += delta; + return *this; + } - /// Adds an offset to the pointer + /// Returns a TensorRef offset by a given amount CUTLASS_HOST_DEVICE - TensorRef& advance(Coord const& b) { - ptr_ += offset(b); + TensorRef operator+(TensorCoord const& b) const { + TensorRef result(*this); + result.add_pointer_offset(offset(b)); + return result; + } + + /// Returns a TensorRef offset by a given amount + CUTLASS_HOST_DEVICE + TensorRef& operator+=(TensorCoord const& b) { + add_pointer_offset(offset(b)); return *this; } /// Returns a TensorRef offset by a given amount 
CUTLASS_HOST_DEVICE - TensorRef operator+(Coord const& b) const { return TensorRef(ptr_ + offset(b), stride_); } + TensorRef operator-(TensorCoord const& b) const { + TensorRef result(*this); + result.add_pointer_offset(-offset(b)); + return result; + } /// Returns a TensorRef offset by a given amount CUTLASS_HOST_DEVICE - TensorRef operator-(Coord const& b) const { return TensorRef(ptr_ - offset(b), stride_); } + TensorRef& operator-=(TensorCoord const& b) { + add_pointer_offset(-offset(b)); + return *this; + } }; -//////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace cutlass +} // namespace cutlass diff --git a/cutlass/tensor_ref_collection.h b/cutlass/tensor_ref_collection.h new file mode 100644 index 0000000000..b2972e1848 --- /dev/null +++ b/cutlass/tensor_ref_collection.h @@ -0,0 +1,420 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Introduces TensorRefCollection concept and defines TensorRefBatch and TensorRefArray. +*/ + +#pragma once + +#include "cutlass/tensor_ref.h" + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// TensorRefCollection is a concept for storing a logical collection of TensorRef objects. Classes +// satisfying the TensorRefCollection concept must support the following: +// +// // Define storage type +// typedef typename TensorRefCollection::Storage Storage; +// +// // Define a type for offsets in memory +// typedef typename TensorRefCollection::LongIndex LongIndex; +// +// // Define a ConstIterator type satisfying TensorRefIterator +// typedef typename TensorRefCollection::ConstIterator TensorRefIterator; +// +// // Implement a begin() method. +// TensorRefIterator iterator = collection.begin(); +// +// +// TensorRefIterator is a concept for accessing an element in a TensorRefCollection. 
Classes +// satisfying the TensorRefIterator concept must support the following: +// +// // Define a TensorRef type accessed by the iterator +// typedef typename TensorRefIterator::TensorRef TensorRef; +// +// // Access the TensorRef +// TensorRef ref = *iterator; +// +// // Pre-increment and post-increment +// ++iterator; +// iterator++; +// +// // Pre-decrement and post-decrement +// --iterator; +// iterator--; +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// This satisfies TensorRefCollection and stores a collection of TensorRef objects that +/// have identical strides. TensorRef objects are separated by a linear stride. +template < + /// Data type of element stored within tensor + typename Storage_, + /// Rank of logical tensor + int Rank_, + /// Maps a Coord in the logical tensor index space to the internal n-D array + typename MapFunc_ = IdentityTensorMapFunc, + /// Rank of internal n-D array + int StorageRank_ = MapFunc_::kStorageRank, + /// Index type used for coordinates + typename Index_ = int, + /// Index type used for offsets and pointer differences + typename LongIndex_ = long long +> +struct TensorRefBatchStrided: + public TensorRef { + + // + // Type definitions + // + + /// Underlying TensorRef type + typedef TensorRef Base; + + /// Storage type + typedef typename Base::Storage Storage; + + /// Index type + typedef Index_ Index; + + /// Typically, strides in memory can be very large + typedef LongIndex_ LongIndex; + + /// Coordinate in logical tensor space + typedef Coord TensorCoord; + + /// Tensor reference implied by the TensorRefBatchStrided + typedef Base TensorRef; + + /// Constant iterator over tensors implied by TensorRefBatchStrided + class ConstIterator { + public: + /// TensorRef returned by the iterator + typedef Base TensorRef; + + private: + + /// Reference to the parent TensorBatchRef object + TensorRefBatchStrided const &ref_; + + /// Offset from the base TensorRef 
pointer + LongIndex offset_; + + public: + + /// Constructs a ConstIterator from a parent TensorRefBatchStrided + CUTLASS_HOST_DEVICE + ConstIterator( + TensorRefBatchStrided const &ref, + LongIndex offset = 0): ref_(ref), offset_(offset) { } + + /// Obtains a TensorRef pointed to by the iterator + CUTLASS_HOST_DEVICE + TensorRef *operator() const { + TensorRef ref(ref_); + ref.add_pointer_offset(offset_); + return ref; + } + + /// Advances the iterator to point to the next tensor + CUTLASS_HOST_DEVICE + ConstIterator &operator++() { + offset_ += ref_.tensor_stride; + return *this; + } + + /// Advances the iterator to point to the next tensor + CUTLASS_HOST_DEVICE + ConstIterator operator++(int) { + ConstIterator ret(*this); + offset_ += ref_.tensor_stride; + return ret; + } + + /// Returns an iterator advanced by (idx) amount + CUTLASS_HOST_DEVICE + ConstIterator operator+(Index idx) { + return ConstIterator(ref, offset_ + ref_.tensor_stride * idx); + } + + /// Advances this iterator by (idx) and returns a reference to self + CUTLASS_HOST_DEVICE + ConstIterator &operator+=(Index idx) { + offset_ += ref_.tensor_stride * idx; + return *this; + } + + /// Moves to the previous tensor + CUTLASS_HOST_DEVICE + ConstIterator &operator--() { + offset_ -= ref_.tensor_stride; + return *this; + } + + /// Moves to the previous tensor + CUTLASS_HOST_DEVICE + ConstIterator operator--(int) { + ConstIterator ret(*this); + offset_ -= ref_.tensor_stride; + return ret; + } + + /// Returns an iterator moved forward by (idx) amount + CUTLASS_HOST_DEVICE + ConstIterator operator-(Index idx) { + return ConstIterator(ref_, offset_ - ref_.tensor_stride * idx); + } + + /// Moves this iterator by (idx) and returns a reference to self + CUTLASS_HOST_DEVICE + ConstIterator &operator-=(Index idx) { + offset_ -= ref_.tensor_stride * idx; + return *this; + } + + /// Returns the difference in offset between two iterators + CUTLASS_HOST_DEVICE + Stride operator-(ConstIterator const &it) { + return 
offset_ - it.offset_; + } + }; + + // + // Data members + // + + /// Stride between tensors + LongIndex tensor_stride; + + // + // Methods + // + + // Default ctor + CUTLASS_HOST_DEVICE + TensorRefBatchStrided(): tensor_stride(0) { } + + // Constructs form a tensor reference and + CUTLASS_HOST_DEVICE + TensorRefBatchStrided(TensorRef const &ref, LongIndex _tensor_stride = 0): + TensorRef(ref), + tensor_stride(_tensor_stride) { } + + /// Gets the pointer offset + CUTLASS_HOST_DEVICE + LongIndex get_pointer_offset(Index idx) const { + return idx * tensor_stride; + } + + // Returns a reference + CUTLASS_HOST_DEVICE + TensorRef at(Index idx) const { + TensorRef ref(*this); + ref.add_pointer_offset(get_pointer_offset(idx)); + return ref; + } + + /// Returns an iterator + CUTLASS_HOST_DEVICE + ConstIterator begin() { + return ConstIterator(*this); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// This satisfies TensorRefCollection and stores a collection of TensorRef objects. This is a +/// structure of arrays in that the individual members of the TensorRef are held in distinct arrays. +/// +/// Note, TensorRef maps a logical coordinate space to an n-D array with rank kStorageRank. It +/// maintains a stride vector of similar rank, but the least significant rank is defined to be 1. +/// +/// The least significant stride of 1 is not stored, and therefore the number of stride arrays is +/// kStorageRank - 1. 
+template < + /// Data type of element stored within tensor + typename Storage_, + /// Rank of logical tensor + int Rank_, + /// Maps a Coord in the logical tensor index space to the internal n-D array + typename MapFunc_ = IdentityTensorMapFunc, + /// Rank of internal n-D array + int StorageRank_ = MapFunc_::kStorageRank, + /// Index type used for coordinates + typename Index_ = int, + /// Index type used for offsets and pointer differences + typename LongIndex_ = long long +> +struct TensorRefArray { + // + // Type definitions + // + + /// TensorRef type obtained from the TensorRefArray + typedef TensorRef TensorRef; + + /// Element pointed to by the TensorRef + typedef Storage_ Storage; + + /// Index type + typedef Index_ Index; + + /// Typically, strides in memory can be very large + typedef LongIndex_ LongIndex; + + /// Rank of the stride vector + static int const kStorageRank = TensorRef::kStorageRank; + + /// TensorRefIterator over TensorRef objects in TensorRefArray + class ConstIterator { + public: + + /// TensorRef returned by the iterator + typedef Base TensorRef; + + private: + /// Reference to the TensorRefArray + TensorRefArray const &ref_; + + /// Index into TensorRefArray + int idx_; + + public: + + /// Constructs a ConstIterator over the TensorRef objects + CUTLASS_HOST_DEVICE + ConstIterator(TensorArrayRef const &ref, int idx = 0): ref_(ref), idx_(idx) { } + + /// Obtains a TensorRef pointed to by this iterator + CUTLASS_HOST_DEVICE + TensorRef *operator() const { + return ref_.reference(idx_); + } + + /// Advances to next TensorRef + CUTLASS_HOST_DEVICE + ConstIterator &operator++() { + ++idx_; + return *this; + } + + /// Advances to next TensorRef + CUTLASS_HOST_DEVICE + ConstIterator operator++(int) { + ConstIterator ret(*this); + idx_ ++; + return ret; + } + + CUTLASS_HOST_DEVICE + ConstIterator operator+(Index idx) { + return ConstIterator(ref_, idx_ + idx); + } + + CUTLASS_HOST_DEVICE + ConstIterator &operator+=(Index idx) { + idx_ += idx; + 
return *this; + } + + CUTLASS_HOST_DEVICE + ConstIterator &operator--() { + --idx_; + return *this; + } + + /// Advances to next TensorRef + CUTLASS_HOST_DEVICE + ConstIterator operator--(int) { + ConstIterator ret(*this); + --idx_; + return ret; + } + + CUTLASS_HOST_DEVICE + ConstIterator &operator-=(Index idx) { + idx_ -= idx; + return *this; + } + + CUTLASS_HOST_DEVICE + ConstIterator operator-(Index idx) { + return ConstIterator(ref_, idx_ + idx); + } + }; + + // + // Data members + // + + /// Base addresses + Storage **pointers; + + /// Array of strides + Index *strides[kStorageRank - 1]; + + // + // Methods + // + + // Default ctor + CUTLASS_HOST_DEVICE + TensorArrayRef() { } + + // Construct from pointers to arrays to strides + CUTLASS_HOST_DEVICE + TensorArrayRef( + Storage **_pointers, + Index _strides[kStorageRank - 1]): pointers(_pointers) { + + // Copy pointers to strides arrays + for (int i = 0; i < kStorageRank - 1; ++i) { + strides[i] = _strides[i]; + } + } + + // Returns a TensorRef at the given index in the collection + CUTLASS_HOST_DEVICE + TensorRef at(Index idx) const { + Coord stride; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kStorageRank - 1; ++i) { + stride[i] = stride_[idx][i]; + } + return TensorRef(pointers[idx], stride); + } + + /// Returns an TesnorRefIterator over the TensorRef objects in this collection + CUTLASS_HOST_DEVICE + ConstIterator begin() { + return ConstIterator(*this); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/cutlass/tensor_view.h b/cutlass/tensor_view.h index 89c6bd5716..4ef99e027e 100644 --- a/cutlass/tensor_view.h +++ b/cutlass/tensor_view.h @@ -24,51 +24,110 @@ **************************************************************************************************/ /*! \file \brief Defines a structure containing strides and a pointer to tensor data. 
+ + TensorView is derived from TensorRef and contributes bounds to the tensor's index space. Thus, + it is a complete mathematical object and may be used in tensor algorithms. It is decoupled from + data storage and is therefore lightweight and may be embedded in larger tensor objects or + memory structures. + + See cutlass/tensor_ref.h for more details about the mapping of the logical tensor index space to + linear memory. */ #pragma once #include -#include -#include +#include "cutlass/cutlass.h" +#include "cutlass/tensor_ref.h" namespace cutlass { //////////////////////////////////////////////////////////////////////////////////////////////////// -/// Host-side reference implementation of tensor operations -template -class TensorView : public TensorRef { +/// Defines a view into a logical tensor +template < + /// Data type of element stored within tensor + typename Storage_, + /// Rank of logical tensor + int Rank_ = 4, + /// Maps a Coord in the logical tensor index space to the internal n-D array + typename MapFunc_ = IdentityTensorMapFunc, + /// Rank of internal n-D array + int StorageRank_ = MapFunc_::kStorageRank, + /// Index type used for coordinates + typename Index_ = int, + /// Index type used for offsets and pointer differences + typename LongIndex_ = long long +> +class TensorView : public TensorRef { public: - /// Reference and stride - typedef TensorRef Base; + /// Base tensor reference + typedef TensorRef Base; + + /// Tensor reference to of constant value + typedef TensorRef< + typename platform::remove_const::type const, + Rank_, + MapFunc_, + StorageRank_, + Index_, + LongIndex_> ConstTensorRef; + + /// Base tensor reference + typedef Base TensorRef; + + /// Storage type + typedef typename Base::Storage Storage; + + /// Index type + typedef typename Base::Index Index; + + /// Coordinate in logical tensor space + typedef typename TensorRef::TensorCoord TensorCoord; + + /// Coordinate in storage n-D array + typedef typename TensorRef::StorageCoord 
StorageCoord; + + /// Stride vector in storage coordinate space + /// Least significant stride is = 1 and not stored + typedef typename TensorRef::StrideVector StrideVector; + + /// TensorView of constant value + typedef TensorView< + typename platform::remove_const::type const, + Rank_, + MapFunc_, + StorageRank_, + Index_, + LongIndex_> ConstTensorView; - /// Reference and stride - typedef Base TensorRef_t; + // + // Definitions included for backwards compatibility - to be removed in next major release + // - /// Reference to constant type - typedef TensorRef ConstTensorRef_t; + /// Coordinate in logical tensor space + typedef TensorCoord Coord_t; - /// Rank of tensor - static int const Rank = TensorRef_t::Rank; + /// Logical rank of tensor index space + static int const Rank = Base::kRank; /// Type used to compute the offset of an element to the base of a tensor - typedef int Offset_t; + typedef typename Base::LongIndex Offset_t; + + /// Base class + typedef TensorRef TensorRef_t; - /// Coordinate into tensor - typedef Coord Coord_t; + /// TensorRef to const-valued type + typedef typename TensorRef::ConstTensorRef ConstTensorRef_t; private: // // Data members // - /// Pointer to pitch-linear memory - TensorRef_t ref_; - /// Dimensions of coordinate (independent of stride) - Coord_t size_; + TensorCoord size_; public: // @@ -79,91 +138,126 @@ class TensorView : public TensorRef { CUTLASS_HOST_DEVICE TensorView() {} - /// Constructs a Tensor_view from a TensorRef and size + /// Constructs a TensorView from a TensorRef and size CUTLASS_HOST_DEVICE - TensorView(TensorRef_t const& _ref, Coord_t const& _size) : Base(_ref), size_(_size) {} + TensorView(Base const& _ref, TensorCoord const& _size) : Base(_ref), size_(_size) {} - /// Returns true if the Tensor_view is bound to some memory + /// Constructs a TensorView from a pointer, a stride vector, and size CUTLASS_HOST_DEVICE - bool good() const { return ref().good(); } - - /// Returns a pointer to data + TensorView( + 
Storage *ptr, + StrideVector const &stride, + TensorCoord const& size + ): + Base(ptr, stride), size_(size) {} + + /// Constructs a TensorView from a pointer, a stride vector, and size CUTLASS_HOST_DEVICE - T* data() const { return ref().data(); } + TensorView( + Storage *ptr, + StorageCoord const &stride, + TensorCoord const& size + ): + Base(ptr, stride), size_(size) {} /// Updates the reference and size of a Tensor_view object CUTLASS_HOST_DEVICE - void reset(TensorRef_t const& _ref = TensorRef_t(0), Coord_t const& _size = Coord_t()) { + void reset(Base const& _ref = Base(), TensorCoord const& _size = TensorCoord()) { Base::operator=(_ref); size_ = _size; } - /// Accesses the tensor reference pointing to data - CUTLASS_HOST_DEVICE - TensorRef_t& ref() { return *this; } - - /// - CUTLASS_HOST_DEVICE - ConstTensorRef_t const_ref() { return ConstTensorRef_t(data(), stride()); } - - /// Accesses the tensor reference pointing to data - CUTLASS_HOST_DEVICE - TensorRef_t const& ref() const { return *this; } - /// Accesses the size CUTLASS_HOST_DEVICE - Coord_t const& size() const { return size_; } + TensorCoord const& size() const { return size_; } /// Accesses the size CUTLASS_HOST_DEVICE - int size(int dim) const { return size_.at(dim); } - - /// Accesses the stride - CUTLASS_HOST_DEVICE - Coord_t const& stride() const { return ref().stride(); } - - /// Accesses the stride - CUTLASS_HOST_DEVICE - int const& stride(int dim) const { return ref().stride(dim); } + Index size(int dim) const { return size_.at(dim); } /// Assigns the Tensor_view CUTLASS_HOST_DEVICE TensorView& operator=(TensorView const& _tensor) { - Base::operator=(_tensor._ref); + Base::operator=(_tensor); size_ = _tensor.size_; return *this; } - /// Returns the index of an element - CUTLASS_HOST_DEVICE - Offset_t offset(Coord_t const& coord) const { return ref().offset(coord); } - /// Determines whether a location is within a tensor CUTLASS_HOST_DEVICE - bool contains(Coord_t const& coord) const { - for 
(int dim = 0; dim < Rank; ++dim) { - if (coord.at(dim) >= size_.at(dim)) { + bool contains(TensorCoord const& coord) const { + CUTLASS_PRAGMA_UNROLL + for (int dim = 0; dim < Rank_; ++dim) { + if (coord[dim] >= size_[dim]) { return false; } } return true; } - /// Element-wise accessor + /// Returns a TensorRef pointing to the first element of the tensor. CUTLASS_HOST_DEVICE - T& at(Coord_t const& coord) const { return ref().at(coord); } - - /// Element-wise accessor - T& operator[](Coord const& coord) const { return at(coord); } + TensorRef ref() const { + return TensorRef(*this); + } - /// Element-wise accessor + /// Returns a TensorRef pointing to the first element of the tensor. CUTLASS_HOST_DEVICE - T& at(Offset_t idx) const { return ref().at(idx); } + ConstTensorRef const_ref() const { + return ConstTensorRef(*this); + } /// Returns a Tensor_view given location and size quantities CUTLASS_HOST_DEVICE - TensorView subview(Coord_t const& location, Coord_t size) const { - return TensorView(ref() + location, size.clamp(size_ - location)); + TensorView subview(TensorCoord const& location, TensorCoord size) const { + return TensorView((*this) + location, size.clamp(size_ - location)); + } + + /// Returns the number of scalar elements needed to store tensor + CUTLASS_HOST_DEVICE + size_t capacity() const { + int max_rank = 0; + + StorageCoord mapped_size(this->map(size())); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Base::kStorageRank; ++i) { + if (!i || + this->stride(i) * mapped_size[i] > this->stride(max_rank) * mapped_size[max_rank]) { + max_rank = i; + } + } + return this->stride(max_rank) * mapped_size[max_rank]; + } + + /// Returns a TensorView offset by a given amount + CUTLASS_HOST_DEVICE + TensorView operator+(TensorCoord const& b) const { + TensorView result(*this); + result.add_pointer_offset(this->offset(b)); + return result; + } + + /// Returns a TensorRef offset by a given amount + CUTLASS_HOST_DEVICE + TensorView& operator+=(TensorCoord const& b) 
{ + this->add_pointer_offset(this->offset(b)); + return *this; + } + + /// Returns a TensorRef offset by a given amount + CUTLASS_HOST_DEVICE + TensorView operator-(TensorCoord const& b) const { + TensorRef result(*this); + result.add_pointer_offset(-this->offset(b)); + return result; + } + + /// Returns a TensorRef offset by a given amount + CUTLASS_HOST_DEVICE + TensorView& operator-=(TensorCoord const& b) { + this->add_pointer_offset(-this->offset(b)); + return *this; } }; diff --git a/cutlass/tile_allocation.h b/cutlass/tile_allocation.h new file mode 100644 index 0000000000..81db797f9a --- /dev/null +++ b/cutlass/tile_allocation.h @@ -0,0 +1,143 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines a fragment based on a Shape<> template. +*/ +#pragma once + +#include "cutlass/shape.h" +#include "cutlass/fragment.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/zip_tensor_ref.h" + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Class for storing a tile in memory and accessing it through a tensor ref +template +struct TileAllocation { + // + // Type definitions + // + + /// Scalar element + typedef Scalar_ Scalar; + + /// The actual storage (may differ from the scalar type) + typedef typename StorageType::Type Storage; + + /// Size of the allocation in units of scalars + typedef Shape_ Shape; + + /// Strides + typedef typename ShapeStrides::Shape Strides; + + /// Defines the tensor reference for this allocation + typedef TensorRef ConstTensorRef; + + /// Defines the tensor reference for this allocation + typedef TensorRef TensorRef; + + // + // Data members + // + + /// Storage + Storage storage[Shape::kD][Shape::kH][Shape::kW][Shape::kC]; + + // + // Methods + // + + /// Returns a pointer to the raw data + CUTLASS_DEVICE + Scalar *data() { return reinterpret_cast(&storage[0][0][0][0]); } + + /// Returns a const pointer to the raw data + CUTLASS_DEVICE + Scalar const *data() const { return reinterpret_cast(&storage[0][0][0][0]); } + + 
/// Returns a TensorRef object pointing to the data + CUTLASS_DEVICE + TensorRef reference() { + return TensorRef(data(), make_Coord(Strides::kD, Strides::kH, Strides::kW, Strides::kC)); + } + + /// Returns a TensorRef object pointing to the data + CUTLASS_DEVICE + ConstTensorRef reference() const { + return ConstTensorRef(data(), make_Coord(Strides::kD, Strides::kH, Strides::kW, Strides::kC)); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Manages a pair of tile allocations as if they are one allocation +template +struct ZipTileAllocation { + // + // Type definitions + // + + /// First tensor allocation + typedef First_ First; + + /// Second tensor allocation + typedef Second_ Second; + + /// Defines the tensor reference for this allocation + typedef ZipTensorRef TensorRef; + + /// Defines the tensor reference for this allocation + typedef ZipTensorRef + ConstTensorRef; + + // + // Data members + // + + /// First tensor allocation + First first; + + /// Second tensor allocation + Second second; + + // + // Methods + // + + /// Returns a TensorRef object pointing to the data + CUTLASS_DEVICE + TensorRef reference() { return TensorRef(first.reference(), second.reference()); } + + /// Returns a TensorRef object pointing to the data + CUTLASS_DEVICE + ConstTensorRef reference() const { return ConstTensorRef(first.reference(), second.reference()); } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/cutlass/tile_coord.h b/cutlass/tile_coord.h new file mode 100644 index 0000000000..b3d809bc36 --- /dev/null +++ b/cutlass/tile_coord.h @@ -0,0 +1,194 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines a coordinate used for the CUTLASS 4-D tile structure. +*/ + +#pragma once + +#include "cutlass/coord.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// TileCoord wraps Coord<4, int> to provide a helper for accessing named dimensions. 
Classes +/// expecting a coordinate in the rank=4 index space of a CUTLASS tile structure should use TileCoord. +template +struct TileCoord : public Coord<4, Index_> { + + /// Index type + typedef Index_ Index; + + /// Underlying Coord<4> + typedef Coord<4, Index> Base; + + /// D dimension + static int kD = 0; + + /// H dimension + static int kH = 1; + + /// W dimension + static int kW = 2; + + /// C dimension + static int kC = 3; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + TileCoord() { } + + /// Constructs from Coord<3> and infers coord[kC] = 0 + CUTLASS_HOST_DEVICE + TileCoord(Coord<3, Index> const &coord): + Base(make_Coord(coord[0], coord[1], coord[2], 0)) { } + + /// Constructs from Coord<4> + CUTLASS_HOST_DEVICE + TileCoord(Coord<4, Index> const &coord): Base(coord) { } + + /// Constructs from an array of coordinate elements + CUTLASS_HOST_DEVICE + TileCoord(Index coord[4]): Base(coord) { } + + /// Helper to construct from a row and column + CUTLASS_HOST_DEVICE + TileCoord(Index d, Index h, Index w, Index c): Base(make_Coord(d, h, w, c)) { } + + /// Returns the D element of the coordinate + CUTLASS_HOST_DEVICE + Index const & d() const { return this->at(kD); } + + /// Returns the D element of the coordinate + CUTLASS_HOST_DEVICE + Index & d() { return this->at(kD); } + + /// Returns the H element of the coordinate + CUTLASS_HOST_DEVICE + Index const & h() const { return this->at(kH); } + + /// Returns the H element of the coordinate + CUTLASS_HOST_DEVICE + Index & h() { return this->at(kH); } + + /// Returns the W element of the coordinate + CUTLASS_HOST_DEVICE + Index const & w() const { return this->at(kW); } + + /// Returns the W element of the coordinate + CUTLASS_HOST_DEVICE + Index & w() { return this->at(kW); } + + /// Returns the Celement of the coordinate + CUTLASS_HOST_DEVICE + Index const & c() const { return this->at(kC); } + + /// Returns the C element of the coordinate + CUTLASS_HOST_DEVICE + Index & c() { return 
this->at(kC); } + + /// Gets H and W dimensions as a Coord<2> + CUTLASS_HOST_DEVICE + Coord<2> hw() const { + return make_Coord(h(), w()); + } + + /// Gets H, W, and C dimensions as a Coord<3> + CUTLASS_HOST_DEVICE + Coord<3> hwc() const { + return make_Coord(h(), w(), c()); + } + + /// Gets D, H, and W dimensions as a Coord<3> + CUTLASS_HOST_DEVICE + Coord<3> dhw() const { + return make_Coord(d(), h(), w()); + } + + // + // Coord operators + // + + /// Element-wise addition + CUTLASS_HOST_DEVICE + TileCoord operator+(Base const& b) const { + return TileCoord(Base::operator+(b)); + } + + /// Element-wise subtraction + CUTLASS_HOST_DEVICE + TileCoord operator-(Base const& b) const { + return TileCoord(Base::operator-(b)); + } + + /// Element-wise multiplication + CUTLASS_HOST_DEVICE + TileCoord operator*(Base const& b) const { + return TileCoord(Base::operator*(b)); + } + + /// Element-wise division + CUTLASS_HOST_DEVICE + TileCoord operator/(Base const& b) const { + return TileCoord(Base::operator/(b)); + } + + /// In-place addition + CUTLASS_HOST_DEVICE + TileCoord& operator+=(Base const& b) { + Base::operator+=(b); + return *this; + } + + /// In-place subtraction + CUTLASS_HOST_DEVICE + TileCoord& operator-=(Base const& b) { + Base::operator-=(b); + return *this; + } + + /// In-place multiplication + CUTLASS_HOST_DEVICE + TileCoord& operator*=(Base const& b) { + Base::operator*=(b); + return *this; + } + + /// In-place division + CUTLASS_HOST_DEVICE + TileCoord& operator/=(Base const& b) { + Base::operator/=(b); + return *this; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/cutlass/tile_iterator.h b/cutlass/tile_iterator.h index 5d39c4f808..51e5779490 100644 --- a/cutlass/tile_iterator.h +++ b/cutlass/tile_iterator.h @@ -28,10 +28,13 @@ */ #pragma once -#include -#include -#include -#include +#include "cutlass/coord.h" +#include "cutlass/tensor_ref.h" +#include 
"cutlass/fragment.h" +#include "cutlass/load_store.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/vector.h" +#include namespace cutlass { @@ -61,12 +64,6 @@ as a Coord<4>. struct IteratorAdvance { enum Kind { kD, kH, kW }; }; - -/// Specifies whether iterator storage fragment consists of Scalar values or WMMA matrix -struct IteratorFragment { - enum Kind { kScalar, kWmmaMatrix }; -}; - /////////////////////////////////////////////////////////////////////////////////////////////////// /** @@ -77,7 +74,7 @@ template + int AccessSize> struct TileTraits { /// Shape of the tile typedef Tile_ Tile; @@ -89,11 +86,52 @@ struct TileTraits { typedef Iterations_ Iterations; /// Functor that returns the logical coordinate of each entity's initial offset in the tile + // + // ThreadOffset should be a functor defined like: + // + // struct ThreadOffsetExample { + // CUTLASS_DEVICE + // Coord<4> operator()() const { + // return make_Coord(0, threadIdx.y, threadIdx.x, 0); + // } + // }; + // typedef ThreadOffset_ ThreadOffset; + + /// Strides for immediate offset computation + typedef Shape<0, 0, 0, 0> ImmediateOffsetStrides; + + /// Access size + static int const kAccessSize = AccessSize; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Functor computing a predicate given the logical position of an access +template +struct RegularTilePredicateFunctor { + typedef Delta_ Delta; + + /// Dimensions of the bounding volume + Coord<3> bounds; + + /// Constructs a predicate functor given the bounds of a tensor + CUTLASS_HOST_DEVICE + RegularTilePredicateFunctor(Coord<3> _bounds) : bounds(_bounds) {} + + /// Computes the predicate given the logical position of an access + CUTLASS_HOST_DEVICE + bool operator()(Coord<3> iteration, Coord<3> offset) const { + return (iteration[0] * Delta::kD + offset[0] < bounds[0]) && + (iteration[1] * Delta::kH + offset[1] < bounds[1]) && + (iteration[2] * Delta::kW + offset[2] < 
bounds[2]); + } }; /////////////////////////////////////////////////////////////////////////////////////////////////// +template +struct DumpType {}; /// Iterator for accessing a stripmined tile in memory template > struct TileIteratorBase { /// concept TileTraits @@ -117,7 +155,7 @@ struct TileIteratorBase { static IteratorAdvance::Kind const kAdvance = Advance_; /// Specifies iterator storage fragment type (Scalar or WmmaMatrix) - static IteratorFragment::Kind const kIteratorFragment = IteratorFragment_; + static FragmentElementType::Kind const kFragmentElementType = FragmentElementType_; /// Source or destination memory space static MemorySpace::Kind const kMemorySpace = MemorySpace; @@ -144,18 +182,19 @@ struct TileIteratorBase { typedef typename Traits::ThreadOffset ThreadOffset; /// The number of scalars accessed per load/store. - static int const kAccessSize = Tile::kC; + static int const kAccessSize = Traits::kAccessSize; /// The elements loaded/store by one instruction. typedef typename Vectorize::Type AccessType; /// The size of storage needed per fragment static int const kFragmentSize = - (kIteratorFragment == IteratorFragment::kWmmaMatrix ? 16 : sizeof(AccessType)); + (kFragmentElementType == FragmentElementType::kWmmaMatrix ? 16 : sizeof(AccessType)); /// The storage. typedef Fragment::kCount, kFragmentSize> Storage; /// The fragment. typedef Fragment::kCount * kAccessSize> Fragment; + /// The fragment iterator. typedef FragmentIterator FragmentIterator; /// The fragment const iterator. 
@@ -172,25 +211,61 @@ struct TileIteratorBase { /// Parameters to the iterator struct Params { - Index stride_d; + + // + // Dat members + // + + long long stride_d; Index stride_h; Index stride_w; - Index inc_d; + long long inc_d; Index inc_h; Index inc_w; - Index inc_advance; + long long inc_advance; + + // + // Methods + // + + /// Constructs params + CUTLASS_HOST_DEVICE + Params() : stride_d(0), stride_h(0), stride_w(0), inc_d(0), inc_h(0), inc_w(0) {} + + /// Constructs params + CUTLASS_HOST_DEVICE + Params(long long _stride_d, + Index _stride_h, + Index _stride_w, + long long _inc_d, + Index _inc_h, + Index _inc_w, + long long _inc_advance) + : stride_d(_stride_d), + stride_h(_stride_h), + stride_w(_stride_w), + inc_d(_inc_d), + inc_h(_inc_h), + inc_w(_inc_w), + inc_advance(_inc_advance) {} + + /// Constructs params with a stride vector + CUTLASS_HOST_DEVICE + Params(Coord<4> const &stride) { + initialize(stride); + } /// Initializes params CUTLASS_HOST_DEVICE - int initialize(Index _stride_d, + int initialize(long long _stride_d, Index _stride_h, Index _stride_w, - Index _inc_d, + long long _inc_d, Index _inc_h, Index _inc_w, - Index _inc_advance) { + long long _inc_advance) { stride_d = _stride_d; stride_h = _stride_h; stride_w = _stride_w; @@ -203,61 +278,79 @@ struct TileIteratorBase { return 0; } + /// Initializes the parameters object from a vector of strides + CUTLASS_HOST_DEVICE + int initialize(Coord<4> const &stride) { + return initialize(stride[0], stride[1], stride[2]); + } + + /// Initializes the parameters object from a vector of strides CUTLASS_HOST_DEVICE - int initialize(Index _stride_d, Index _stride_h, Index _stride_w) { + int initialize(long long _stride_d, Index _stride_h, Index _stride_w) { stride_d = _stride_d; stride_h = _stride_h; stride_w = _stride_w; inc_w = stride_w * Delta::kW; inc_h = stride_h * Delta::kH - stride_w * Delta::kW * (Iterations::kW - 1); + inc_d = stride_d * Delta::kD - stride_h * Delta::kH * (Iterations::kH - 1) - 
+ stride_w * Delta::kW * (Iterations::kW - 1); + + inc_advance = 0; if (kAdvance == IteratorAdvance::kH) { // Advance in the H dimension. - inc_d = 0; + inc_advance = Tile::kH * stride_h; } else if (kAdvance == IteratorAdvance::kW) { // Advance in the W dimension. - inc_d = stride_w * Tile::kW - stride_h * Tile::kH; + inc_advance = Tile::kW * stride_w; + } else { // Advance in the D dimension. - inc_d = stride_d; + inc_advance = Tile::kD * stride_d; } - inc_advance = 0; + inc_advance -= stride_d * Delta::kD * (Iterations::kD - 1) + + stride_h * Delta::kH * (Iterations::kH - 1) + + stride_w * Delta::kW * (Iterations::kW - 1); return 0; } + /// Gotta have this CUTLASS_HOST_DEVICE int initialize() { stride_d = 0; stride_h = 0; stride_w = 1; - inc_d = inc_h = inc_w = inc_advance = 0; + inc_advance = 0; + inc_d = inc_h = inc_w = 0; return 0; } }; /// Is the iterator valid? - CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const { return true; } + CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const { return true; } // // Static function members // /// Initializes a predicate vector - template - CUTLASS_DEVICE static void initialize_predicates(PredicateIterator predicate_it, - Coord<3> const &bounds, - Coord<3> const &offset = make_Coord(0, 0, 0)) { + template + CUTLASS_HOST_DEVICE static void initialize_predicates(PredicateIterator predicate_it, + PredicateFunctor const &predicate_func, + Coord<3> const &offset) { + CUTLASS_PRAGMA_UNROLL for (int d = 0; d < Iterations::kD; ++d) { - bool enable_d = (d * Delta::kD + offset[0] < bounds[0]); + CUTLASS_PRAGMA_UNROLL for (int h = 0; h < Iterations::kH; ++h) { - bool enable_h = (h * Delta::kH + offset[1] < bounds[1]); + CUTLASS_PRAGMA_UNROLL for (int w = 0; w < Iterations::kW; ++w) { - bool enable_w = (w * Tile::kC * Delta::kW + offset[2] < bounds[2]); - predicate_it.set(d, h, w, 0, enable_d && enable_h && enable_w); + bool enable = predicate_func(make_Coord(d, h, w), offset); + predicate_it.set(enable); + 
++predicate_it; } } } @@ -301,7 +394,7 @@ template > struct TileLoadIterator : public TileIteratorBase { /// Base class typedef TileIteratorBase Base; @@ -329,13 +422,13 @@ struct TileLoadIterator : public TileIteratorBase TensorRef; + /// Parameters struct Params : public BaseParams { /// Pointer to memory Scalar const *pointer; + // + // Methods + // + + /// Initialize params to access storage object + CUTLASS_HOST_DEVICE + Params() : pointer(0){ Base::Params::initialize(); } + + /// Initialize params to access storage object + CUTLASS_HOST_DEVICE + Params(Scalar const *ptr) : pointer(ptr) { Base::Params::initialize(); } + + /// Constructs with a CompactTensorRef<> + CUTLASS_HOST_DEVICE + Params(TensorRef const &ref): pointer(ref.data()) { + Base::Params::initialize(ref.stride()); + } + + /// Initialize params to access storage object + CUTLASS_HOST_DEVICE + Params(Scalar const *ptr, + long long _stride_d, + Index _stride_h, + Index _stride_w, + long long _inc_d, + Index _inc_h, + Index _inc_w, + Index _inc_advance) + : pointer(ptr) { + Base::Params::initialize( + _stride_d, _stride_h, _stride_w, _inc_d, _inc_h, _inc_w, _inc_advance); + } + + /// Initialize params to access storage object + CUTLASS_HOST_DEVICE + Params(Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w) + : pointer(ptr) { + Base::Params::initialize(stride_d, stride_h, stride_w); + } + + /// Initializes params to access a raw pointer + CUTLASS_HOST_DEVICE + int initialize(TensorRef const &ref) { + pointer = ref.data(); + return Base::Params::initialize(ref.stride()); + } + /// Initialize params to access storage object CUTLASS_HOST_DEVICE int initialize(SharedStorage const &storage) { pointer = &storage[0]; + Base::Params::initialize(); + return 0; + } + + /// Initialize params to access storage object + CUTLASS_HOST_DEVICE + int initialize(Scalar const *ptr) { + pointer = ptr; + Base::Params::initialize(); return 0; } /// Initializes params to access a raw pointer 
CUTLASS_HOST_DEVICE - int initialize(Scalar const *ptr, Index stride_d, Index stride_h, Index stride_w) { + int initialize(Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w) { Base::Params::initialize(stride_d, stride_h, stride_w); pointer = ptr; return 0; @@ -411,10 +566,10 @@ struct TileLoadIterator : public TileIteratorBase + /// Initializes a predicate vector using a RegularTilePredicateFunctor + template < + /// Predicate iterator + typename PredicateIterator> CUTLASS_HOST_DEVICE void initialize_predicates(PredicateIterator predicate_it, Coord<3> const &bounds, Coord<3> const &block_offset = make_Coord(0, @@ -455,8 +612,23 @@ struct TileLoadIterator : public TileIteratorBase(bounds), + block_offset + make_Coord(thread_offset[0], thread_offset[1], thread_offset[2])); + } + + /// Initializes a predicate vector using an arbitrary predicate functor + template < + /// Predicate iterator + typename PredicateIterator, + /// Functor computing predicates + typename PredicateFunctor> + CUTLASS_HOST_DEVICE void initialize_predicates(PredicateIterator predicate_it, + PredicateFunctor const &functor, + Coord<3> const &block_offset) { + Base::initialize_predicates( + predicate_it, + functor, + block_offset + make_Coord(thread_offset[0], thread_offset[1], thread_offset[2])); } // @@ -475,41 +647,27 @@ struct TileLoadIterator : public TileIteratorBase const &block_offset = make_Coord(0, 0, 0), ThreadOffset thread_offset_func = ThreadOffset()) : stage(0) { - int const offset = thread_offset_func()[2]; - params.pointer = &shared_storage[offset]; - } + params.pointer = ptr + thread_offset_func()[2]; - /// Returns the current pointer - CUTLASS_HOST_DEVICE - Scalar const *data() const { return params.pointer; } + params.stride_d = 0; + params.stride_h = 0; + params.stride_w = 1; - /// The accessor. 
- CUTLASS_DEVICE void get(AccessType &value, int d, int h, int w, int c) const { - int const imm = - ComputeOffsetFromStrides::get(d, h, w, c); - Load::load(value, params.pointer, imm); + params.inc_d = params.inc_h = params.inc_w = params.inc_advance = 0; } /// Increment in the D dimension @@ -524,8 +682,21 @@ struct TileLoadIterator : public TileIteratorBase::get(d, h, w, c); + Load::load(value, params.pointer, offset); + } + /// Increment the stage. - CUTLASS_DEVICE void inc_stage() { + CUTLASS_HOST_DEVICE void inc_stage() { if (Tile::kD > 1) { int const kStageSize = Tile::kH * Tile::kW * Tile::kC; if (stage == Tile::kD - 1) { @@ -538,7 +709,27 @@ struct TileLoadIterator : public TileIteratorBase const &offset) { + long long _offset = offset.template dot( + make_Coord(params.stride_d, params.stride_h, params.stride_w) + ); + + params.pointer += _offset; + return *this; + } + + /// Adds a raw offset to the pointer + CUTLASS_HOST_DEVICE void add_pointer_offset(Index offset) { params.pointer += offset; } + + CUTLASS_HOST_DEVICE Index stride_advance(void) { + Index stride = params.stride_h; + if (kAdvance == IteratorAdvance::kW) { + stride = params.stride_w; + } + return stride; + } + /// Loads a fragment and advances the iterator to the next tile. 
template CUTLASS_HOST_DEVICE void load_post_increment(Fragment &fragment, PredicateIterator pred_it) { @@ -547,11 +738,12 @@ struct TileLoadIterator : public TileIteratorBase::load( - reinterpret_cast(frag_iterator.at(d, h, w, 0)), data(), 0); + for (int c = 0; c < Iterations::kC; ++c) { + if (*pred_it) { + load_element( + reinterpret_cast(frag_iterator.at(d, h, w, c)), d, h, w, c); + } } - if (w < Iterations::kW - 1) { inc_w(); } @@ -587,6 +779,19 @@ struct TileLoadIterator : public TileIteratorBase + CUTLASS_HOST_DEVICE void load(Fragment &fragment, int d) { + FragmentIterator frag_iterator(fragment); + for (int h = 0; h < Iterations::kH; ++h) { + for (int w = 0; w < Iterations::kW; ++w) { + for (int c = 0; c < Iterations::kC; ++c) { + load_element(reinterpret_cast(frag_iterator.at(0, h, w, c)), d, h, w, c); + } + } + } + } }; /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -626,7 +831,7 @@ template > struct TileStoreIterator : public TileIteratorBase { /// Base class typedef TileIteratorBase Base; @@ -660,11 +865,14 @@ struct TileStoreIterator : public TileIteratorBase TensorRef; + /// Parameters struct Params : public BaseParams { /// Pointer to memory Scalar *pointer; + // + // Methods + // + + // Default constructor: null pointer, base strides and increments set by Base::Params::initialize() + CUTLASS_HOST_DEVICE + Params() : pointer(0) { Base::Params::initialize(); } + + // Constructs from a raw pointer; base strides and increments set by Base::Params::initialize() + CUTLASS_HOST_DEVICE + Params(Scalar *ptr) : pointer(ptr) { Base::Params::initialize(); } + + /// Constructs with a CompactTensorRef<> + CUTLASS_HOST_DEVICE + Params(TensorRef const &ref): pointer(ref.data()) { + Base::Params::initialize(ref.stride()); + } + + // Constructs from a raw pointer and explicit strides (forwards to initialize()) + CUTLASS_HOST_DEVICE + Params(Scalar *ptr, long long stride_d, Index stride_h, Index stride_w) { + initialize(ptr, stride_d, stride_h, stride_w); + } + + // Constructs from a raw pointer with explicit strides and increments (forwards to initialize()) + CUTLASS_HOST_DEVICE + Params(Scalar *ptr, + long long _stride_d, + Index _stride_h, + Index _stride_w, + long long _inc_d, + Index _inc_h, + Index _inc_w, +
Index _inc_advance) { + initialize(ptr, _stride_d, _stride_h, _stride_w, _inc_d, _inc_h, _inc_w, _inc_advance); + } + /// Initialize params to access storage object CUTLASS_HOST_DEVICE int initialize(SharedStorage &storage) { pointer = &storage[0]; - return 0; + return Base::Params::initialize(); + } + + /// Initialize params to access storage object + CUTLASS_HOST_DEVICE + int initialize(Scalar *ptr) { + pointer = ptr; + return Base::Params::initialize(); } /// Initializes params to access a raw pointer CUTLASS_HOST_DEVICE - int initialize(Scalar *ptr, Index stride_d, Index stride_h, Index stride_w) { + int initialize(Scalar *ptr, long long stride_d, Index stride_h, Index stride_w) { Base::Params::initialize(stride_d, stride_h, stride_w); pointer = ptr; return 0; @@ -730,10 +988,10 @@ struct TileStoreIterator : public TileIteratorBase + /// Initializes a predicate vector using a RegularTilePredicateFunctor + template < + /// Predicate iterator + typename PredicateIterator> CUTLASS_HOST_DEVICE void initialize_predicates(PredicateIterator predicate_it, Coord<3> const &bounds, Coord<3> const &block_offset = make_Coord(0, @@ -774,8 +1034,23 @@ struct TileStoreIterator : public TileIteratorBase(bounds), + block_offset + make_Coord(thread_offset[0], thread_offset[1], thread_offset[2])); + } + + /// Initializes a predicate vector using an arbitrary predicate functor + template < + /// Predicate iterator + typename PredicateIterator, + /// Functor computing predicates + typename PredicateFunctor> + CUTLASS_HOST_DEVICE void initialize_predicates(PredicateIterator predicate_it, + PredicateFunctor const &functor, + Coord<3> const &block_offset) { + Base::initialize_predicates( + predicate_it, + functor, + block_offset + make_Coord(thread_offset[0], thread_offset[1], thread_offset[2])); } // @@ -794,25 +1069,22 @@ struct TileStoreIterator : public TileIteratorBase const &block_offset = make_Coord(0, 0, 0), - ThreadOffset thread_offset_func = ThreadOffset()) + 
TileStoreIterator(Params const &, Scalar *ptr, ThreadOffset thread_offset_func = ThreadOffset()) : stage(0) { - int const offset = thread_offset_func()[2]; - params.pointer = &shared_storage[offset]; - } + params.pointer = ptr + thread_offset_func()[2]; + params.stride_d = 0; + params.stride_h = 0; + params.stride_w = 1; - /// Returns the current pointer - CUTLASS_HOST_DEVICE - Scalar *data() const { return params.pointer; } + params.inc_d = params.inc_h = params.inc_w = params.inc_advance = 0; + } /// Increment in the D dimension CUTLASS_HOST_DEVICE void inc_d() { params.pointer += params.inc_d; } @@ -827,7 +1099,7 @@ struct TileStoreIterator : public TileIteratorBase 1) { int const kStageSize = Tile::kH * Tile::kW * Tile::kC; if (stage == Tile::kD - 1) { @@ -840,25 +1112,43 @@ struct TileStoreIterator : public TileIteratorBase const &offset) { + params.pointer += offset.template dot( + make_Coord(params.stride_d, params.stride_h, params.stride_w) + ); + return *this; + } + + /// Adds a raw offset to the pointer + CUTLASS_HOST_DEVICE void add_pointer_offset(Index offset) { params.pointer += offset; } + + /// Stores a single fragment element into memory. + CUTLASS_HOST_DEVICE void store_element(AccessType const &value, int d, int h, int w, int c) { + int const offset = ComputeOffsetFromStrides::get(d, h, w, c); - Store::store(value, params.pointer, imm); + Store::store(value, params.pointer, offset); } - public: /// Stores a fragment and advances to the next tile. 
template - CUTLASS_HOST_DEVICE void store_post_increment(Fragment &fragment, PredicateIterator pred_it) { - FragmentIterator frag_iterator(fragment); + CUTLASS_HOST_DEVICE void store_post_increment(Fragment const &fragment, PredicateIterator pred_it) { + FragmentConstIterator frag_iterator(fragment); for (int d = 0; d < Iterations::kD; ++d) { for (int h = 0; h < Iterations::kH; ++h) { for (int w = 0; w < Iterations::kW; ++w, ++pred_it) { - if (*pred_it) { - Store::store( - reinterpret_cast(frag_iterator.at(d, h, w, 0)), data(), 0); + for (int c = 0; c < Iterations::kC; ++c) { + if (*pred_it) { + store_element( + reinterpret_cast(frag_iterator.at(d, h, w, c)), d, h, w, c); + } } if (w < Iterations::kW - 1) { inc_w(); @@ -877,23 +1167,103 @@ struct TileStoreIterator : public TileIteratorBase - CUTLASS_HOST_DEVICE void store_post_increment(Fragment &fragment) { + CUTLASS_HOST_DEVICE void store_post_increment(Fragment const &fragment) { typename PredicateVector::TrivialIterator pred_it; store_post_increment(fragment, pred_it); } /// Stores a fragment without advancing the iterator. template - CUTLASS_HOST_DEVICE void store(Fragment &fragment, PredicateIterator pred_it) const { + CUTLASS_HOST_DEVICE void store(Fragment const &fragment, PredicateIterator pred_it) const { TileStoreIterator _store_it(*this); _store_it.store_post_increment(fragment, pred_it); } /// Stores a fragment without advancing the iterator. template - CUTLASS_HOST_DEVICE void store(Fragment &fragment) const { + CUTLASS_HOST_DEVICE void store(Fragment const &fragment) const { typename PredicateVector::TrivialIterator pred_it; store(fragment, pred_it); } + + /// Loads a single fragment element from memory + CUTLASS_HOST_DEVICE void load_element(AccessType &value, int d, int h, int w, int c) const { + int const offset = + ComputeOffsetFromStrides::get(d, h, w, c); + + Load::load(value, params.pointer, offset); + } + + /// Loads a fragment and advances the iterator to the next tile. 
+ template + CUTLASS_HOST_DEVICE void load_post_increment(Fragment &fragment, PredicateIterator pred_it) { + FragmentIterator frag_iterator(fragment); + + for (int d = 0; d < Iterations::kD; ++d) { + for (int h = 0; h < Iterations::kH; ++h) { + for (int w = 0; w < Iterations::kW; ++w, ++pred_it) { + for (int c = 0; c < Iterations::kC; ++c) { + if (*pred_it) { + load_element( + reinterpret_cast(frag_iterator.at(d, h, w, c)), d, h, w, c); + } + } + if (w < Iterations::kW - 1) { + inc_w(); + } + } + if (h < Iterations::kH - 1) { + inc_h(); + } + } + if (d < Iterations::kD - 1) { + inc_d(); + } + } + inc_advance(); + } + + /// Loads a fragment and advances the iterator to the next tile. + template + CUTLASS_HOST_DEVICE void load_post_increment(Fragment &fragment) { + typename PredicateVector::TrivialIterator pred_it; + load_post_increment(fragment, pred_it); + } + + /// Loads a fragment without advancing the iterator.. + template + CUTLASS_HOST_DEVICE void load(Fragment &fragment, PredicateIterator pred_it) const { + TileStoreIterator _load_it(*this); + _load_it.load_post_increment(fragment, pred_it); + } + + /// Loads a fragment without advancing the iterator.. + template + CUTLASS_HOST_DEVICE void load(Fragment &fragment) const { + typename PredicateVector::TrivialIterator pred_it; + load(fragment, pred_it); + } + + /// Loads a fragment without advancing the iterator.. 
+ template + CUTLASS_HOST_DEVICE void load(Fragment &fragment, int d) { + FragmentIterator frag_iterator(fragment); + for (int h = 0; h < Iterations::kH; ++h) { + for (int w = 0; w < Iterations::kW; ++w) { + for (int c = 0; c < Iterations::kC; ++c) { + load_element(reinterpret_cast(frag_iterator.at(0, h, w, c)), d, h, w, c); + } + } + } + } }; -} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/cutlass/tile_stream.h b/cutlass/tile_stream.h new file mode 100644 index 0000000000..7790605a05 --- /dev/null +++ b/cutlass/tile_stream.h @@ -0,0 +1,378 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implements the tile stream concept, composing an iterator with a transformation. Offers + split-phase semantics, separating the initiation of an asynchronous memory operation with a + fence forcing it to complete. +*/ +#pragma once + +// clang-format off + +#include "cutlass/convert.h" +#include "cutlass/tile_iterator.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Generic stream for loading and transforming fragments +template > +struct TileLoadStream { + // + // Type definitions + // + + /// TileLoadIterator + typedef Iterator_ Iterator; + + /// Transformer + typedef Transformer_ Transformer; + + /// Fragment fetched from source memory + typedef typename Iterator::Fragment Fragment; + + /// Output fragment from transformer + typedef typename Transformer::OutputFragment TransformedFragment; + + /// Tensor reference expected by the stream + typedef typename Iterator::TensorRef TensorRef; + + /// Empty predicate vector struct + struct PredicateVector {}; + + /// Index type + typedef typename Iterator::Index Index; + + /// Parameters object used to construct generic load stream + struct Params { + /// Parameters to the iterator + typename 
Iterator::Params iterator; + + // + // Methods + // + + /// Default constructor + CUTLASS_HOST_DEVICE + Params() {} + + /// Constructor with iterator params + CUTLASS_HOST_DEVICE + Params(typename Iterator::Params const &_iterator) : iterator(_iterator) {} + }; + + // + // Data members + // + + /// Iterator to load tiles + Iterator iterator; + + /// Fragment loaded via iterator + Fragment fetched_fragment; + + /// Transformation applied to fragments + Transformer transformer; + + /// Transformed fragment from transformer + TransformedFragment transformed_fragment; + + // + // Methods + // + + /// Constructs the iterator from the iterator params and a tensor reference + CUTLASS_DEVICE + TileLoadStream(Params const &_params, TensorRef const &_ref) + : iterator(_params.iterator, _ref) {} + + /// Constructs the iterator from the iterator params and a threadblock offset + CUTLASS_DEVICE + TileLoadStream(Params const &_params, + Coord<3> const &threadblock_offset = make_Coord(0, 0, 0) + ): iterator(_params.iterator, threadblock_offset) { } + + /// Loads a tile into the fetched fragment and increments the iterator + CUTLASS_DEVICE + void copy() { iterator.load_post_increment(fetched_fragment); } + + /// Transforms the fetched fragment into the transformed fragment + CUTLASS_DEVICE + void commit() { transformer.transform(fetched_fragment, transformed_fragment); } + + /// Accesses the fetched fragment, i.e. the input to the transformer + CUTLASS_DEVICE + Fragment &intermediate_fragment() { return fetched_fragment; } + + /// Accesses the transformed fragment written by commit() + CUTLASS_DEVICE + TransformedFragment &fragment() { return transformed_fragment; } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Generic stream for transforming and storing fragments +template > +struct TileStoreStream { + // + // Type definitions + // + + /// Iterator used to store tiles + typedef Iterator_ Iterator; + + /// Transformer + typedef Transformer_ Transformer; + + /// Source fragment + typedef typename Transformer::InputFragment Fragment; + + /// Transformed fragment, compatible with Iterator::Fragment + typedef typename
Transformer::OutputFragment TransformedFragment; + + /// Tensor reference expected by the underlying iterator + typedef typename Iterator::TensorRef TensorRef; + + /// Empty predicate vector struct + struct PredicateVector {}; + + /// Index type + typedef typename Iterator::Index Index; + + /// Parameters used to construct the stream + struct Params { + /// Parameters to the iterator + typename Iterator::Params iterator; + + // + // Methods + // + + /// Default constructor + CUTLASS_HOST_DEVICE + Params() {} + + /// Constructor with iterator params + CUTLASS_HOST_DEVICE + Params(typename Iterator::Params const &_iterator) : iterator(_iterator) {} + }; + + // + // Data members + // + + /// Iterator to store tiles + Iterator iterator; + + /// Transformation applied to inputs + Transformer transformer; + + /// Source fragment + Fragment source_fragment; + + /// Transformed fragment from transformer + TransformedFragment transformed_fragment; + + // + // Methods + // + + /// Constructs the iterator from the iterator params and a tensor reference + CUTLASS_DEVICE + TileStoreStream(Params const &_params, TensorRef const &_ref) + : iterator(_params.iterator, _ref) {} + + /// Constructs the iterator from the iterator params and a threadblock offset + CUTLASS_DEVICE + TileStoreStream(Params const &_params, + Coord<3> const &threadblock_offset = make_Coord(0, 0, 0) + ): iterator(_params.iterator, threadblock_offset) { } + + /// Transforms the source fragment, stores the result, and increments the iterator + CUTLASS_DEVICE + void copy() { + + transformer.transform(source_fragment, transformed_fragment); + iterator.store_post_increment(transformed_fragment); + } + + /// Copies frag into the source fragment, then transforms and stores it via copy() + CUTLASS_DEVICE + void copy(Fragment const &frag) { + source_fragment = frag; + copy(); + } + + /// Commits the store operation (empty body in this stream) + CUTLASS_DEVICE + void commit() {} + + /// Accesses the source fragment, i.e. the input to the transformer + CUTLASS_DEVICE + Fragment &fragment() { return source_fragment; } + + /// Accesses the fragment after transforming + CUTLASS_DEVICE + TransformedFragment &intermediate_fragment() { return transformed_fragment; } +}; +
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Generic stream for loading and transforming fragments +template , + typename Transformer_ = Copy > +struct PredicatedTileLoadStream : public TileLoadStream { + // + // Type definitions + // + + typedef TileLoadStream Base; + + /// TileLoadIterator + typedef Iterator_ Iterator; + + /// Predicate functor + typedef PredicateFunctor_ PredicateFunctor; + + /// Transformer + typedef Transformer_ Transformer; + + /// Fragment fetched from source memory + typedef typename Base::Fragment Fragment; + + /// Output fragment from transformer + typedef typename Base::TransformedFragment TransformedFragment; + + /// Parameters object used to construct generic load stream + typedef typename Base::Params Params; + + // + // Data members + // + + /// Predicates + typename Iterator::PredicateVector predicates; + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + PredicatedTileLoadStream(Params const &_params, + Coord<3> const &bounds, + Coord<3> const &threadblock_offset = make_Coord(0, 0, 0)) + : Base(_params, threadblock_offset) { + this->iterator.initialize_predicates( + predicates.begin(), PredicateFunctor(bounds), threadblock_offset); + } + + /// Loads a tile and increments the iterator + CUTLASS_DEVICE + void copy() { this->iterator.load_post_increment(this->fetched_fragment, predicates.begin()); } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Generic stream for transforming and storing fragments +template , + typename Transformer_ = Copy > +struct PredicatedTileStoreStream : public TileStoreStream { + // + // Type definitions + // + + typedef TileStoreStream Base; + + /// TileLoadIterator + typedef Iterator_ Iterator; + + /// Predicate functor + typedef PredicateFunctor_ PredicateFunctor; + + /// Transformer + typedef Transformer_ Transformer; + + /// Fragment fetched from source memory + typedef 
typename Base::Fragment Fragment; + + /// Output fragment from transformer + typedef typename Base::TransformedFragment TransformedFragment; + + /// Parameters object used to construct generic load stream + typedef typename Base::Params Params; + + // + // Data members + // + + /// Predicates + typename Iterator::PredicateVector predicates; + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + PredicatedTileStoreStream(Params const &_params, + Coord<3> const &bounds, + Coord<3> const &threadblock_offset = make_Coord(0, 0, 0)) + : Base(_params, threadblock_offset) { + this->iterator.initialize_predicates( + predicates.begin(), PredicateFunctor(bounds), threadblock_offset); + } + + /// Stores the fragment and increments the iterator + CUTLASS_DEVICE + void copy() { + this->transformer.transform(this->source_fragment, this->transformed_fragment); + this->iterator.store_post_increment(this->transformed_fragment, predicates.begin()); + } + + /// Stores the fragment and increments the iterator + CUTLASS_DEVICE + void copy(Fragment const &frag) { + this->source_fragment = frag; + copy(); + } + + /// Commits the store operation + CUTLASS_DEVICE + void commit() {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +// clang-format on diff --git a/cutlass/tile_traits_standard.h b/cutlass/tile_traits_standard.h index 14ecd01abc..9145c5da92 100644 --- a/cutlass/tile_traits_standard.h +++ b/cutlass/tile_traits_standard.h @@ -28,7 +28,7 @@ */ #pragma once -#include +#include "cutlass/tile_iterator.h" namespace cutlass { @@ -204,6 +204,9 @@ struct TileTraitsStandard { /// Number of participating warps static int const kWarpCount = kThreads / kWarpSize; + /// By default, do not do scalar loads + static int const kAccessSize = 1; + // Static assertions static_assert(!(ShapeCount::kDhw % kThreads), "Tiling undefined if elements not divisible by threads."); @@ -223,8 +226,7 @@ struct 
TileTraitsStandard { typedef typename Traits::Delta Delta; /// Delta between each thread's access - /// TODO MTA this is wrong for sure, but Delta is used for stride computation at the moment - typedef Delta ImmediateOffsetStrides; + typedef Shape<0, 0, 0, 0> ImmediateOffsetStrides; /// Number of accesses typedef typename Traits::Iterations Iterations; diff --git a/cutlass/util/complex.h b/cutlass/util/complex.h new file mode 100644 index 0000000000..260a3abd2c --- /dev/null +++ b/cutlass/util/complex.h @@ -0,0 +1,457 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include +#include "cutlass/cutlass.h" +#include + +namespace cutlass { +namespace platform { + +////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Accessors for CUDA complex types +// + +/// Returns the real part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +CUTLASS_HOST_DEVICE +float const &real(cuFloatComplex const &z) { return z.x; } + +/// Returns the real part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +CUTLASS_HOST_DEVICE +float &real(cuFloatComplex &z) { return z.x; } + +/// Returns the real part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +CUTLASS_HOST_DEVICE +double const &real(cuDoubleComplex const &z) { return z.x; } + +/// Returns the real part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +CUTLASS_HOST_DEVICE +double &real(cuDoubleComplex &z) { return z.x; } + +/// Returns the imaginary part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate 
complex with a + // host-only type +CUTLASS_HOST_DEVICE +float const &imag(cuFloatComplex const &z) { return z.y; } + +/// Returns the imaginary part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +CUTLASS_HOST_DEVICE +float &imag(cuFloatComplex &z) { return z.y; } + +/// Returns the imaginary part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +CUTLASS_HOST_DEVICE +double const &imag(cuDoubleComplex const &z) { return z.y; } + +/// Returns the imaginary part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +CUTLASS_HOST_DEVICE +double &imag(cuDoubleComplex &z) { return z.y; } + +////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Class for representing and manipulating complex numbers with conversions from built-in CUDA +/// complex types. 
+template +class complex { + public: + /// Type alias for scalar type + typedef T value_type; + + private: + // + // Data members + // + + /// Real part + T _real; + + /// Imaginary part + T _imag; + + public: +// +// Methods +// + +/// Constructor +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + complex(T r = T(0), T i = T(0)) : _real(r), _imag(i) {} + +/// Conversion from cuFloatComplex +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + complex(cuFloatComplex const &z) : _real(platform::real(z)), _imag(platform::imag(z)) {} + +/// Conversion from cuDoubleComplex +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + complex(cuDoubleComplex const &z) : _real(platform::real(z)), _imag(platform::imag(z)) {} + +/// Accesses the real part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + T const &real() const { return _real; } + +/// Accesses the real part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + T &real() { return _real; } + +/// Accesses the imaginary part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + T const &imag() const { return _imag; } + +/// Accesses the imaginary part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + T &imag() { return _imag; } + +/// Converts to cuFloatComplex +#pragma hd_warning_disable // Suppresses warnings when 
attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + operator cuFloatComplex() const { return make_cuFloatComplex(real(), imag()); } + +/// Converts to cuDoubleComplex +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type + CUTLASS_HOST_DEVICE + operator cuDoubleComplex() const { return make_cuDoubleComplex(real(), imag()); } +}; + +// +// Accessors for complex template +// + +/// Returns the real part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE T const &real(complex const &z) { + return z.real(); +} + +/// Returns the real part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE T &real(complex &z) { + return z.real(); +} + +/// Returns the imaginary part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE T const &imag(complex const &z) { + return z.imag(); +} + +/// Returns the imaginary part of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE T &imag(complex &z) { + return z.imag(); +} + +// +// Output operators +// + +template +std::ostream &operator<<(std::ostream &out, complex const &z) { + T _r = real(z); + T _i = imag(z); + return out << _r << "+i" << _i; +} + +// +// Non-member operators defined for complex types +// + +/// Equality operator: true when both the real and the imaginary parts compare equal +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE bool operator==(complex const &lhs, complex const &rhs) { + return real(lhs) == real(rhs) && imag(lhs) ==
imag(rhs); +} + +/// Inequality operator +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE bool operator!=(complex const &lhs, complex const &rhs) { + return !(lhs == rhs); +} + +/// Addition +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex operator+(complex const &lhs, complex const &rhs) { + return complex(real(lhs) + real(rhs), imag(lhs) + imag(rhs)); +} + +/// Subtraction +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex operator-(complex const &lhs, complex const &rhs) { + return complex(real(lhs) - real(rhs), imag(lhs) - imag(rhs)); +} + +/// Multiplication +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex operator*(complex const &lhs, complex const &rhs) { + return complex(real(lhs) * real(rhs) - imag(lhs) * imag(rhs), + real(lhs) * imag(rhs) + imag(lhs) * real(rhs)); +} + +/// Scalar Multiplication +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex operator*(complex const &lhs, T const &s) { + return complex(real(lhs) * s, imag(lhs) * s); +} + +/// Scalar Multiplication +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex operator*(T const &s, complex const &rhs) { + return complex(s * real(rhs), s * imag(rhs)); +} + +/// Division: lhs * conj(rhs) divided by the squared magnitude of rhs +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex operator/(complex const &lhs, complex const &rhs) { + T d =
(real(rhs) * real(rhs) + imag(rhs) * imag(rhs)); + + return complex((real(lhs) * real(rhs) + imag(lhs) * imag(rhs)) / d, + (imag(lhs) * real(rhs) - real(lhs) * imag(rhs)) / d); +} + +/// Scalar Division +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex operator/(complex const &lhs, T const &s) { + return complex(real(lhs) / s, imag(lhs) / s); +} + +/// Scalar divided by complex: s * conj(rhs) divided by the squared magnitude of rhs +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex operator/(T const &s, complex const &rhs) { + T d = (real(rhs) * real(rhs) + imag(rhs) * imag(rhs)); + + return complex((s * real(rhs)) / d, -(s * imag(rhs)) / d); +} + +/// In-place addition +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex &operator+=(complex &lhs, complex const &rhs) { + lhs = (lhs + rhs); + return lhs; +} + +/// In-place subtraction +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex &operator-=(complex &lhs, complex const &rhs) { + lhs = (lhs - rhs); + return lhs; +} + +/// In-place multiplication +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex &operator*=(complex &lhs, complex const &rhs) { + lhs = (lhs * rhs); + return lhs; +} + +/// In-place scalar multiplication +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex &operator*=(complex &lhs, T s) { + lhs = (lhs * s); + return lhs; +} + +/// Division +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex
&operator/=(complex &lhs, complex const &rhs) { + lhs = (lhs / rhs); + return lhs; +} + +// +// Non-member functions defined for complex numbers +// + +/// Returns the magnitude of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE T abs(complex const &z) { + return sqrt(norm(z)); +} + +/// Returns the magnitude of the complex number +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE T arg(complex const &z) { + return atan2(imag(z), real(z)); +} + +/// Returns the squared magnitude +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE T norm(complex const &z) { + return real(z) * real(z) + imag(z) * imag(z); +} + +/// Returns the complex conjugate +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex conj(complex const &z) { + return complex(real(z), -imag(z)); +} + +/// Projects the complex number z onto the Riemann sphere +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex proj(complex const &z) { + T d = real(z) * real(z) + imag(z) * imag(z) + T(1); + return complex((T(2) * real(z)) / d, (T(2) * imag(z)) / d); +} + +/// Returns a complex number with magnitude r and phase theta +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex polar(T const &r, T const &theta = T()) { + return complex(r * cos(theta), r * sin(theta)); +} + +/// Computes the complex exponential of z. 
+#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex exp(complex const &z) { + return complex(real(z) * cos(imag(z)), real(z) * sin(imag(z))); +} + +/// Computes the complex exponential of z. +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex log(complex const &z) { + return complex(log(abs(z)), arg(z)); +} + +/// Computes the complex exponential of z. +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex log10(complex const &z) { + return log(z) / T(log(T(10))); +} + +/// Computes the square root of complex number z +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex sqrt(complex const &z) { + return sqrt(T(2)) / T(2) * + complex(sqrt(sqrt(norm(z)) + real(z)), + (imag(z) < 0 ? T(-1) : T(1)) * sqrt(sqrt(norm(z)) - real(z))); +} + +/// Computes the cosine of complex z. +#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex cos(complex const &z) { + return (exp(z) + exp(-z)) / T(2); +} + +/// Computes the sin of complex z. 
+#pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex with a + // host-only type +template +CUTLASS_HOST_DEVICE complex sin(complex const &z) { + return (exp(-z) - exp(z)) * complex(T(0), T(1) / T(2)); +} + +////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace platform +} // namespace cutlass diff --git a/cutlass/util/cutlass_math.h b/cutlass/util/cutlass_math.h index 0ecdc43829..e3b46ef35a 100644 --- a/cutlass/util/cutlass_math.h +++ b/cutlass/util/cutlass_math.h @@ -30,7 +30,7 @@ * \brief Math utilities */ -#include +#include "cutlass/util/platform.h" namespace cutlass { @@ -128,4 +128,38 @@ CUTLASS_HOST_DEVICE value_t lcm(value_t a, value_t b) { return temp ? (a / temp * b) : 0; } +/** + * log2 computation, what's the + * difference between the below codes and + * log2_up/down codes? + */ +template +CUTLASS_HOST_DEVICE value_t clz(value_t x) { + for (int i = 31; i >= 0; --i) { + if ((1 << i) & x) return 31 - i; + } + return 32; +} + +template +CUTLASS_HOST_DEVICE value_t find_log2(value_t x) { + int a = 31 - clz(x); + a += (x & (x - 1)) != 0; // Round up, add 1 if not a power of 2. + return a; +} + +/****************************************************************************** + * Min/Max + ******************************************************************************/ + +template +struct Min { + static int const kValue = (A < B) ? A : B; +}; + +template +struct Max { + static int const kValue = (A > B) ? 
A : B; +}; + } // namespace cutlass diff --git a/cutlass/gemm/identity_block_swizzle.h b/cutlass/util/numeric_types.h similarity index 79% rename from cutlass/gemm/identity_block_swizzle.h rename to cutlass/util/numeric_types.h index e1bdb2e003..d8094a2567 100644 --- a/cutlass/gemm/identity_block_swizzle.h +++ b/cutlass/util/numeric_types.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -22,27 +22,26 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ -/*! \file - \brief Defies functors for mapping blockIdx to partitions of the GEMM computation. - - Currently, we only implement an identity mapping. +/*! + \file + \brief */ #pragma once namespace cutlass { -namespace gemm { -//////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Definitions for 1-bit binary and 4-bit integer types +// + +struct bin1_t {}; // 1-bit binary type -struct IdentityBlockSwizzle { - /// Ctor. - CUTLASS_DEVICE IdentityBlockSwizzle() {} +struct int4_t {}; // 4-bit signed integer type - /// Swizzle the block index. 
- CUTLASS_DEVICE dim3 swizzle() { return blockIdx; } -}; +struct uint4_t {}; // 4-bit unsigned integer type -//////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace gemm } // namespace cutlass diff --git a/cutlass/util/platform.h b/cutlass/util/platform.h index 2a44c10e62..3fd7c897d9 100644 --- a/cutlass/util/platform.h +++ b/cutlass/util/platform.h @@ -110,9 +110,17 @@ #include // For integral constants, conditional metaprogramming, and type traits #endif -#include +#include "cutlass/cutlass.h" #endif + +//----------------------------------------------------------------------------- +// OS +//----------------------------------------------------------------------------- +#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) +#define CUTLASS_OS_WINDOWS +#endif + /****************************************************************************** * Macros ******************************************************************************/ diff --git a/cutlass/vector.h b/cutlass/vector.h index a66dfdef7c..aeababb667 100644 --- a/cutlass/vector.h +++ b/cutlass/vector.h @@ -31,7 +31,8 @@ #include #endif -#include +#include "cutlass/util/numeric_types.h" +#include "cutlass/util/platform.h" namespace cutlass { @@ -80,13 +81,43 @@ union Vector { uint32_t registers[kRegisters]; /// Accessor to the ith lane. - CUTLASS_DEVICE Scalar const& operator[](uint32_t i) const { return scalars[i]; } + CUTLASS_HOST_DEVICE Scalar const& operator[](uint32_t i) const { return scalars[i]; } /// Accessor to the ith lane. 
- CUTLASS_DEVICE Scalar& operator[](uint32_t i) { return scalars[i]; } + CUTLASS_HOST_DEVICE Scalar& operator[](uint32_t i) { return scalars[i]; } }; //////////////////////////////////////////////////////////////////////////////////////////////////// +template <> +union Vector { + /// The scalar type. + typedef half Scalar; + + /// The number of elements in the vector. + enum { kLanes = 1 }; + /// The size of the vector. + enum { kVectorSize = kLanes * (int)sizeof(Scalar) }; + /// The number of registers needed to store the vector. + enum { kRegisters = kVectorSize < 4 ? 1 : kVectorSize / 4 }; + + // Make sure that the vector type makes sense. + static_assert(kVectorSize <= 16, "Vector type is too large"); + + /// The aligned storage to make sure we have good alignment. + AlignedStruct aligned_; + /// The associated array of scalars. + uint16_t scalars[kLanes]; + + /// Accessor to the ith lane. + CUTLASS_HOST_DEVICE Scalar const& operator[](uint32_t i) const { + return reinterpret_cast(scalars[i]); + } + /// Accessor to the ith lane. + CUTLASS_HOST_DEVICE Scalar& operator[](uint32_t i) { + return reinterpret_cast(scalars[i]); + } +}; + #if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16) template @@ -112,19 +143,124 @@ union Vector { uint32_t registers[kRegisters]; /// Accessor to the ith lane. - CUTLASS_DEVICE Scalar const& operator[](uint32_t i) const { + CUTLASS_HOST_DEVICE Scalar const& operator[](uint32_t i) const { return reinterpret_cast(scalars[i]); } /// Accessor to the ith lane. - CUTLASS_DEVICE Scalar& operator[](uint32_t i) { return reinterpret_cast(scalars[i]); } + CUTLASS_HOST_DEVICE Scalar& operator[](uint32_t i) { + return reinterpret_cast(scalars[i]); + } }; #endif //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Vector definition for 1-bit binary datatype +template +union Vector { + /// The scalar type. + typedef bin1_t Scalar; + + /// The number of elements in the vector. 
+ enum { kLanes = kLanes_ }; + /// The size of the vector. + enum { kVectorSize = kLanes / 8 }; + /// The number of registers needed to store the vector. + enum { kRegisters = kVectorSize < 4 ? 1 : kVectorSize / 4 }; + + static_assert((kLanes >= 8) && !(kLanes % 8), + "May only construct vectors of bin1_t that are multiples of 8 bits."); + + /// The aligned storage to make sure we have good alignment. + AlignedStruct aligned_; + /// The data in registers. + uint32_t registers[kRegisters]; + + /// Default Constructor + CUTLASS_HOST_DEVICE + Vector() {} + /// Constructor to convert from uint32_t type + CUTLASS_HOST_DEVICE Vector(uint32_t value) { registers[0] = value; } + /// Accessor to the ith lane. + CUTLASS_HOST_DEVICE bool operator[](uint32_t i) const { + return ( (registers[i / 32] & (1 << (i % 32))) != 0 ); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Vector definition for 4-bit signed integer datatype +template +union Vector { + /// The scalar type. + typedef int4_t Scalar; + + /// The number of elements in the vector. + enum { kLanes = kLanes_ }; + /// The size of the vector. + enum { kVectorSize = kLanes / 2 }; + /// The number of registers needed to store the vector. + enum { kRegisters = kVectorSize < 4 ? 1 : kVectorSize / 4 }; + + static_assert((kLanes >= 2) && !(kLanes % 2), + "May only construct vectors of int4_t that are multiples of 8 bits."); + + /// The aligned storage to make sure we have good alignment. + AlignedStruct aligned_; + /// The data in registers. + uint32_t registers[kRegisters]; + + /// Default Constructor + CUTLASS_HOST_DEVICE + Vector() {} + /// Constructor to convert from uint32_t type + CUTLASS_HOST_DEVICE Vector(uint32_t value) { registers[0] = value; } + /// Accessor to the ith lane. 
+ CUTLASS_HOST_DEVICE int operator[](uint32_t i) const { + return (registers[i / 8] >> (i % 8 * 4) & 0x0f) + - 16 * (registers[i / 8] >> (i % 8 * 4 + 3) & 0x01); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Vector definition for 4-bit unsigned integer datatype +template +union Vector { + /// The scalar type. + typedef uint4_t Scalar; + + /// The number of elements in the vector. + enum { kLanes = kLanes_ }; + /// The size of the vector. + enum { kVectorSize = kLanes / 2 }; + /// The number of registers needed to store the vector. + enum { kRegisters = kVectorSize < 4 ? 1 : kVectorSize / 4 }; + + static_assert((kLanes >= 2) && !(kLanes % 2), + "May only construct vectors of uint4_t that are multiples of 8 bits."); + + /// The aligned storage to make sure we have good alignment. + AlignedStruct aligned_; + /// The data in registers. + uint32_t registers[kRegisters]; + + /// Default Constructor + CUTLASS_HOST_DEVICE + Vector() {} + /// Constructor to convert from uint32_t type + CUTLASS_HOST_DEVICE Vector(uint32_t value) { registers[0] = value; } + /// Accessor to the ith lane. 
+ CUTLASS_HOST_DEVICE int operator[](uint32_t i) const { + return registers[i / 8] >> (i % 8 * 4) & 0x0f; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + template -CUTLASS_DEVICE void make_zero(Scalar_& x) { +CUTLASS_HOST_DEVICE void make_zero(Scalar_& x) { x = Scalar_(0); } @@ -137,15 +273,29 @@ struct Vectorize { //////////////////////////////////////////////////////////////////////////////////////////////////// -template -struct Vectorize { - typedef Element_ Type; +template +struct Vectorize, kLanes_> { + typedef Vector Type; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Vectorize, kLanes_> { + typedef Vector Type; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Vectorize, kLanes_> { + typedef Vector Type; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template -CUTLASS_DEVICE void make_zero(Vector& vec) { +CUTLASS_HOST_DEVICE void make_zero(Vector& vec) { for (int i = 0; i < Vector::kRegisters; ++i) { vec.registers[i] = 0; } diff --git a/cutlass/wmma_matrix.h b/cutlass/wmma_matrix.h index c4d8a0b54b..61c4ed2724 100644 --- a/cutlass/wmma_matrix.h +++ b/cutlass/wmma_matrix.h @@ -28,20 +28,23 @@ #pragma once #if defined(__CUDACC__) && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700) - -// Dependent header files should use the following macro to guard all code using -// nvcuda::wmma:: to enable compilation for CUDA Compute Capabilities < sm_70. -// Earlier shader models not support Tensor Cores. 
#define CUTLASS_USE_WMMA_API +#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 10) && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 750) +#define CUTLASS_USE_SUBBYTE_WMMA +#endif + #include "stdio.h" +#if __CUDACC_VER_MAJOR__ >= 10 +#include +#else #include -#include -#include -#include -#include -#include +#endif +#include "cutlass/fragment.h" +#include "cutlass/matrix_traits.h" +#include "cutlass/shape.h" +#include "cutlass/vector.h" namespace cutlass { @@ -61,6 +64,34 @@ struct WmmaLayout { //////////////////////////////////////////////////////////////////////////////////////////////////// +/// Statically maps cutlass types to nvcuda::wmma datatypes +template +struct WmmaDataType{ + typedef Type_ Type; +}; + +#ifdef CUTLASS_USE_SUBBYTE_WMMA +/// Statically maps cutlass::Vector to nvcuda::wmma::experimental::precision::b1 +template<> +struct WmmaDataType > { + typedef nvcuda::wmma::experimental::precision::b1 Type; +}; + +/// Statically maps cutlass::Vector to nvcuda::wmma::experimental::precision::s4 +template<> +struct WmmaDataType > { + typedef nvcuda::wmma::experimental::precision::s4 Type; +}; + +/// Statically maps cutlass::Vector to nvcuda::wmma::experimental::precision::u4 +template<> +struct WmmaDataType > { + typedef nvcuda::wmma::experimental::precision::u4 Type; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Adapter to nvcuda::wmma fragment load and store operations template WmmaShape_::kH, WmmaShape_::kD, /// The scalar. - Scalar_, + typename WmmaDataType::Type, /// The layout. typename WmmaLayout::Layout> { /// This type. @@ -117,7 +148,7 @@ struct WmmaMatrix WmmaShape_::kH, WmmaShape_::kD, /// The scalar. - Scalar_, + typename WmmaDataType::Type, /// The layout. typename WmmaLayout::Layout> { /// This type. 
@@ -188,6 +219,18 @@ struct WmmaMatrix //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace cutlass +// WmmaMatrix cannot be used in a Union and thus in cannot be used in our Vector implementation. +// The only use of WmmaMatrix in in combination with Vectorize has kLanes == 1. Due to this it is +// safe to keep the Vector->Scalar conversion for WmmaMatrix. +template +struct Vectorize, 1> { + typedef WmmaMatrix Type; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +} #endif // defined CUTLASS_USE_WMMA_API diff --git a/cutlass/zip_fragment.h b/cutlass/zip_fragment.h new file mode 100644 index 0000000000..37a788614a --- /dev/null +++ b/cutlass/zip_fragment.h @@ -0,0 +1,150 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Models a pair of fragments +*/ +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/shape.h" +#include "cutlass/util/cutlass_math.h" +#include "cutlass/vector.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/** +* @brief A template defining \ref fragment_concept +* @concept{fragment_concept} +*/ +template +struct ZipFragment { + /// First fragment object + typedef First_ First; + + /// Second fragment object + typedef Second_ Second; + + /// This class. + typedef ZipFragment This_; + + // + // Data members + // + + /// First fragment object + First first; + + /// Second fragment object + Second second; + + // + // Methods + // + + /// Default ctor + CUTLASS_DEVICE + ZipFragment() { } + + /// Copy ctor + CUTLASS_DEVICE + ZipFragment(First const &_first, Second const &_second): first(_first), second(_second) { } + + /// Clear a fragment. 
+ CUTLASS_DEVICE void clear() { + first.clear(); + second.clear(); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to construct a ZipFragment object +template +CUTLASS_HOST_DEVICE +ZipFragment make_ZipFragment(First const &first, Second const &second) { + return ZipFragment(first, second); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Zips two convert operations +template +struct ZipConvert { + /// First convert operator + typedef First_ First; + + /// Second convert operator + typedef Second_ Second; + + /// Defines the input zip fragment + typedef ZipFragment InputFragment; + + /// Defines the output zip fragment + typedef ZipFragment + OutputFragment; + + // + // + // + + /// First transformer + First first; + + /// Second transformer + Second second; + + // + // + // + + /// Ctor. + CUTLASS_DEVICE ZipConvert() {} + + /// Ctor. + CUTLASS_DEVICE ZipConvert(First const &_first, Second const &_second): first(_first), second(_second) { } + + /// Transform a fragment. 
+ CUTLASS_DEVICE void transform(InputFragment const& src, OutputFragment& dst) { + first.transform(src.first, dst.first); + second.transform(src.second, dst.second); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to construct a ZipConvert object +template +CUTLASS_HOST_DEVICE +ZipConvert make_ZipConvert(First const &first, Second const &second) { + return ZipConvert(first, second); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/cutlass/zip_tensor_ref.h b/cutlass/zip_tensor_ref.h new file mode 100644 index 0000000000..d2cff9e0c0 --- /dev/null +++ b/cutlass/zip_tensor_ref.h @@ -0,0 +1,77 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines a structure containing a pair of TensorRef-like objects +*/ +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/tensor_ref.h" + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct ZipTensorRef { + /// First tensor ref + typedef First_ First; + + /// Second tensor ref + typedef Second_ Second; + + // + // Data members + // + + /// First TensorRef + First first; + + /// Second TensorRef + Second second; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + ZipTensorRef() {} + + CUTLASS_HOST_DEVICE + ZipTensorRef(First const& _first, Second const& _second) : first(_first), second(_second) {} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Constructs a ZipTensorRef +template +CUTLASS_HOST_DEVICE +ZipTensorRef make_ZipTensorRef(First const &first, Second const &second) { + return ZipTensorRef(first, second); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass diff --git a/cutlass/zip_tile_iterator.h b/cutlass/zip_tile_iterator.h new file mode 100644 index 0000000000..f8ba4eee3e --- /dev/null +++ b/cutlass/zip_tile_iterator.h @@ -0,0 +1,287 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Constructs an iterator that owns two tile iterator instances +*/ + +#pragma once + +#include "cutlass/coord.h" +#include "cutlass/zip_tensor_ref.h" +#include "cutlass/zip_fragment.h" + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Constructs an iterator from a pair of iterators +template +class ZipTileIterator { + public: + /// First iterator type + typedef First_ First; + + /// Second iterator type + typedef Second_ Second; + + /// Params object + struct Params { + /// Parameters of first iterator + typename First::Params first; + + /// Parameters of second iterator + typename Second::Params second; + + /// Constructs a parameters object + CUTLASS_HOST_DEVICE + Params() {} + + /// Constructs a parameters object + CUTLASS_HOST_DEVICE + Params(typename First::Params const &_first, typename Second::Params const &_second) + : first(_first), second(_second) {} + }; + + /// Fragment type + typedef ZipFragment Fragment; + + /// Predicate vector + typedef typename First::PredicateVector PredicateVector; + + /// Index type + typedef typename First::Index Index; + + /// Tensor reference + typedef ZipTensorRef< + typename First::TensorRef, + typename Second::TensorRef> TensorRef; + + // + // Data members + // + + /// First iterator + First first; + + /// Second iterator + Second second; + + // + // Methods + // + + /// Default constructor + CUTLASS_DEVICE + ZipTileIterator() {} + + /// Constructs a zip iterator from params + CUTLASS_DEVICE + ZipTileIterator(Params const &_params, Coord<3> const &threadblock_offset = make_Coord(0, 0, 0)) + : first(_params.first, threadblock_offset), second(_params.second, threadblock_offset) {} + + /// Constructs a zip iterator from iterator instances + CUTLASS_DEVICE + ZipTileIterator(First const &_first, Second const &_second) : first(_first), second(_second) {} + + /// Constructs a zip iterator from iterator instances + CUTLASS_DEVICE + 
ZipTileIterator(TensorRef const &ref) : first(ref.first), second(ref.second) {} + + /// Constructs a zip iterator from iterator instances + CUTLASS_DEVICE + ZipTileIterator(Params const &_params, TensorRef const &ref): + first(_params.first, ref.first), second(_params.second, ref.second) {} + + // + // Predicate initialization + // + + /// Initializes a predicate vector using a RegularTilePredicateFunctor + template < + /// Predicate iterator + typename PredicateIterator> + CUTLASS_HOST_DEVICE void initialize_predicates(PredicateIterator predicate_it, + Coord<3> const &bounds, + Coord<3> const &block_offset = make_Coord(0, + 0, + 0)) { + first.initialize_predicates(predicate_it, bounds, block_offset); + } + + /// Initializes a predicate vector using an arbitrary predicate functor + template < + /// Predicate iterator + typename PredicateIterator, + /// Functor computing predicates + typename PredicateFunctor> + CUTLASS_HOST_DEVICE void initialize_predicates(PredicateIterator predicate_it, + PredicateFunctor const &functor, + Coord<3> const &block_offset) { + first.initialize_predicates(predicate_it, functor, block_offset); + } + + // + // No predicates + // + + /// Loads a fragment and increments without predicates + template + CUTLASS_DEVICE void load_post_increment(Fragment &fragment) { + first.load_post_increment(fragment.first); + second.load_post_increment(fragment.second); + } + + /// Loads a fragment and increments without predicates + template + CUTLASS_DEVICE void load_post_increment(Fragment &fragment, + Coord<4> const &offset) { + first.load_post_increment(fragment.first, offset); + second.load_post_increment(fragment.second, offset); + } + + /// Loads a fragment without predicates + template + CUTLASS_DEVICE void load(Fragment &fragment) const { + first.load(fragment.first); + second.load(fragment.second); + } + + /// Loads a fragment without predicates + template + CUTLASS_DEVICE void load(Fragment &fragment, + Coord<4> const &offset) const { + 
first.load(fragment.first, offset); + second.load(fragment.second, offset); + } + + /// Stores a fragment and increments without predicates + template + CUTLASS_DEVICE void store_post_increment(Fragment const &fragment) { + first.store_post_increment(fragment.first); + second.store_post_increment(fragment.second); + } + + /// Stores a fragment and increments without predicates + template + CUTLASS_DEVICE void store_post_increment(Fragment const &fragment, + Coord<4> const &offset) { + first.store_post_increment(fragment.first, offset); + second.store_post_increment(fragment.second, offset); + } + + /// Stores a fragment without predicates + template + CUTLASS_DEVICE void store(Fragment const &fragment) const { + first.store(fragment.first); + second.store(fragment.second); + } + + /// Stores a fragment without predicates + template + CUTLASS_DEVICE void store(Fragment const &fragment, + Coord<4> const &offset) const { + first.store(fragment.first, offset); + second.store(fragment.second, offset); + } + + // + // With predication + // + + /// Loads a fragment and increments, using predicates + template + CUTLASS_DEVICE void load_post_increment(Fragment &fragment, PredicateIterator pred_it) { + first.load_post_increment(fragment.first, pred_it); + second.load_post_increment(fragment.second, pred_it); + } + + /// Loads a fragment with predicates + template + CUTLASS_DEVICE void load(Fragment &fragment, PredicateIterator pred_it) const { + first.load(fragment.first, pred_it); + second.load(fragment.second, pred_it); + } + + /// Loads a fragment and increments, using predicates + template + CUTLASS_DEVICE void store_post_increment(Fragment const &fragment, PredicateIterator pred_it) { + first.store_post_increment(fragment.first, pred_it); + second.store_post_increment(fragment.second, pred_it); + } + + /// Loads a fragment with predicates + template + CUTLASS_DEVICE void store(Fragment const &fragment, PredicateIterator pred_it) const { + first.store(fragment.first, 
pred_it); + second.store(fragment.second, pred_it); + } + + // + // Advances the iterators + // + + /// Increments store iterator to next tile + CUTLASS_DEVICE ZipTileIterator &increment(int count = 1) { + first.increment(count); + second.increment(count); + return *this; + } + + /// Increments to next tile + CUTLASS_DEVICE ZipTileIterator &operator++() { return increment(); } + + CUTLASS_DEVICE ZipTileIterator &operator+=(int count) { return increment(count); } + + /// Adds a vector offset to the underlying iterators + CUTLASS_DEVICE ZipTileIterator &operator+=(Coord<3> const &offset) { + first += offset; + second += offset; + return *this; + } + + /// Increments store iterator to previous tile + CUTLASS_DEVICE ZipTileIterator &decrement(int count = 1) { + first.decrement(count); + second.decrement(count); + return *this; + } + + /// Increments to subsequent tile + CUTLASS_DEVICE ZipTileIterator &operator--() { return decrement(); } + + /// Decrements to previous tile + CUTLASS_DEVICE ZipTileIterator &operator-=(int count) { return decrement(count); } + + /// Adds an offset to both iterators + CUTLASS_DEVICE void add_pointer_offset(Index offset) { + first.add_pointer_offset(offset); + second.add_pointer_offset(offset); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namspace cutlass diff --git a/docs/annotated.html b/docs/annotated.html index e6c405d597..da54a8ee0a 100644 --- a/docs/annotated.html +++ b/docs/annotated.html @@ -74,303 +74,368 @@
Here are the classes, structs, unions and interfaces with brief descriptions:
[detail level 1234]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 Ncutlass
 Ngemm
 Nplatform
 CAlignedStruct
 CComputeOffsetFromShapeCompute the offset for the given coordinates in a cube
 CComputeOffsetFromShape< Shape< 1, kSh_, kSw_, 1 > >Compute the offset for the given coordinates in a cube with one channel and a depth of 1
 CComputeOffsetFromShape< Shape< 1, kSh_, kSw_, kSc_ > >Compute the offset for the given coordinates in a cube with a depth of 1
 CComputeOffsetFromStridesCompute the offset for the given coordinates in a cube
 CComputeOffsetFromStrides< Shape< 1, S_h_, S_w_, 1 > >Compute the offset for the given coordinates in a cube with one channel and a depth of 1
 CComputeOffsetFromStrides< Shape< 1, S_h_, S_w_, S_c_ > >Compute the offset for the given coordinates in a cube with a depth of 1
 CComputeThreadOffsetFromStridesDecompose threadId.x into coordinate of a cube whose dimensions are specified by Threads_. Afterwards compute the offset of those coordinates using Strides_
 CComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, 1 >, Shape< 1, S_h_, S_w_, 1 > >Specialization for D=1 and C=1
 CComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, T_c_ >, Shape< 1, S_h_, S_w_, S_c_ > >Specialization for D=1
 CConstPredicateTileAdapterAdapter to enable random access to predicates via logical coordinate within a tile
 CConvert
 CConvert< Fragment< InputScalar_, kScalars_ >, Fragment< OutputScalar_, kScalars_ > >
 CCoordStatically-sized array specifying Coords within a tensor
 CCopy
 Cdivide_assert
 CExtentReturns the extent of a scalar or vector
 CExtent< Vector< T, Lanes > >Returns the number of lanes of a vector if need be
 CExtent< Vector< T, Lanes > const >Returns the number of lanes of a vector if need be
 CFragmentA template defining Fragment Concept
 CFragmentConstIterator
 CFragmentIteratorA template defining Fragment Iterator Concept
 CFragmentLoad
 CFragmentLoad< IteratorFragment::kScalar, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride >
 CFragmentLoad< IteratorFragment::kWmmaMatrix, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride >
 CFragmentStore
 CFragmentStore< IteratorFragment::kScalar, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride >
 CFragmentStore< IteratorFragment::kWmmaMatrix, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride >
 CGemmOperandGemm operand - D = A * B + C
 CIdentityDescribes identity elements
 Cis_pow2
 CIteratorAdvanceSpecifies dimension in which post-increment accesses advance
 CIteratorFragmentSpecifies whether iterator storage fragment consists of Scalar values or WMMA matrix
 CLoad
 CLoad< double, 2, Memory_, true, 16 >
 CLoad< Scalar_, Lanes_, Memory_, true, 16 >
 CLoad< Scalar_, Lanes_, Memory_, true, 4 >
 CLoad< Scalar_, Lanes_, Memory_, true, 8 >
 Clog2_down
 Clog2_down< N, 1, Count >
 Clog2_up
 Clog2_up< N, 1, Count >
 CMatrixLayoutDescribes layouts of matrices
 CMemorySpaceEnum to specify which memory space data resides in
 CPredicateTileAdapterAdapter to enable random access to predicates via logical coordinate within a tile
 CPredicateVectorStatically sized array of bits implementing
 CReshapeTile
 CReshapeTile< Tile_, kAccessSize_, true >
 CShapeA Shape implementing Layout Concept describing the dimensions of a cube
 CShapeAdd
 CShapeCountCompute derived counted of a Layout Concept based class
 CShapeDiv
 CShapeMax
 CShapeMin
 CShapeMul
 CShapeScale
 CShapeStrides
 CShapeSub
 Csqrt_est
 CStorageType
 CStorageType< 1 >
 CStorageType< 2 >
 CStorageType< 4 >
 CStore
 CStore< double, 2, Memory_, true, 16 >
 CStore< Scalar_, Lanes_, Memory_, true, 16 >
 CStore< Scalar_, Lanes_, Memory_, true, 4 >
 CStore< Scalar_, Lanes_, Memory_, true, 8 >
 CTensorRefStructure modeling a pointer and stride into a tensor
 CTensorViewHost-side reference implementation of tensor operations
 CTiledThreadOffsetBasic thread offset function computed from a thread shape
 CTileIteratorBaseIterator for accessing a stripmined tile in memory
 CTileLoadIteratorAn iterator implementing Tile Load Iterator Concept for loading a tile from memory
 CTileStoreIteratorAn iterator implementing Tile Store Iterator Concept for storing a tile to memory
 CTileTraitsA template defining Tile Traits Concept
 CTileTraitsContiguousMajor
 CTileTraitsStandardChooses 'best' shape to enable warp raking along contiguous dimension if possible
 CTileTraitsStrideMajor
 CTileTraitsWarpRakeTiling in which warps rake across the contiguous dimension
 CTrivialPredicateTileAdapterAlways returns true predicate
 CVector
 CVector< half, kLanes_ >
 CVectorize
 CVectorize< Element_, 1 >
 CVectorTraitsTraits describing properties of vectors and scalar-as-vectors
 CVectorTraits< Vector< T, Lanes > >Partial specialization for actual cutlass::Vector
 CVectorTraits< Vector< T, Lanes > const >Partial specialization for actual cutlass::Vector
 Ncutlass
 CDebugType
 CDebugValue
diff --git a/docs/classcutlass_1_1PredicateVector_1_1ConstIterator-members.html b/docs/classcutlass_1_1PredicateVector_1_1ConstIterator-members.html index 860cd05cbf..18f59fc0ce 100644 --- a/docs/classcutlass_1_1PredicateVector_1_1ConstIterator-members.html +++ b/docs/classcutlass_1_1PredicateVector_1_1ConstIterator-members.html @@ -91,7 +91,7 @@ diff --git a/docs/classcutlass_1_1PredicateVector_1_1ConstIterator.html b/docs/classcutlass_1_1PredicateVector_1_1ConstIterator.html index 1fbdc759c7..7e7089a067 100644 --- a/docs/classcutlass_1_1PredicateVector_1_1ConstIterator.html +++ b/docs/classcutlass_1_1PredicateVector_1_1ConstIterator.html @@ -381,7 +381,7 @@

diff --git a/docs/classcutlass_1_1PredicateVector_1_1Iterator-members.html b/docs/classcutlass_1_1PredicateVector_1_1Iterator-members.html index ca3ff04aa1..73d0ebcaa1 100644 --- a/docs/classcutlass_1_1PredicateVector_1_1Iterator-members.html +++ b/docs/classcutlass_1_1PredicateVector_1_1Iterator-members.html @@ -93,7 +93,7 @@ diff --git a/docs/classcutlass_1_1PredicateVector_1_1Iterator.html b/docs/classcutlass_1_1PredicateVector_1_1Iterator.html index 42a0693823..2cbc797d80 100644 --- a/docs/classcutlass_1_1PredicateVector_1_1Iterator.html +++ b/docs/classcutlass_1_1PredicateVector_1_1Iterator.html @@ -443,7 +443,7 @@

diff --git a/docs/classcutlass_1_1TensorRef-members.html b/docs/classcutlass_1_1TensorRef-members.html index 4bf37ad133..202c9ab424 100644 --- a/docs/classcutlass_1_1TensorRef-members.html +++ b/docs/classcutlass_1_1TensorRef-members.html @@ -73,35 +73,52 @@
-
cutlass::TensorRef< Storage_, Rank_ > Member List
+
cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Member List
-

This is the complete list of members for cutlass::TensorRef< Storage_, Rank_ >, including all inherited members.

+

This is the complete list of members for cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >, including all inherited members.

- - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
advance(Coord< Rank > const &b)cutlass::TensorRef< Storage_, Rank_ >inline
at(Coord< Rank > const &coord) constcutlass::TensorRef< Storage_, Rank_ >inline
at(int idx) constcutlass::TensorRef< Storage_, Rank_ >inline
convert()cutlass::TensorRef< Storage_, Rank_ >inline
data() constcutlass::TensorRef< Storage_, Rank_ >inline
good() constcutlass::TensorRef< Storage_, Rank_ >inline
leading_dim() constcutlass::TensorRef< Storage_, Rank_ >inline
offset(Coord< Rank > const &coord) constcutlass::TensorRef< Storage_, Rank_ >inline
operator+(Coord< Rank > const &b) constcutlass::TensorRef< Storage_, Rank_ >inline
operator-(Coord< Rank > const &b) constcutlass::TensorRef< Storage_, Rank_ >inline
operator[](Coord< Rank > const &coord) constcutlass::TensorRef< Storage_, Rank_ >inline
operator[](int idx) constcutlass::TensorRef< Storage_, Rank_ >inline
Rankcutlass::TensorRef< Storage_, Rank_ >static
reset(Storage *ptr=nullptr, Coord< Rank > stride=Coord< Rank >(0))cutlass::TensorRef< Storage_, Rank_ >inline
Storage typedefcutlass::TensorRef< Storage_, Rank_ >
stride() constcutlass::TensorRef< Storage_, Rank_ >inline
stride(int dim) constcutlass::TensorRef< Storage_, Rank_ >inline
TensorRef()cutlass::TensorRef< Storage_, Rank_ >inline
TensorRef(Storage *ptr, Coord< Rank > stride)cutlass::TensorRef< Storage_, Rank_ >inline
add_pointer_offset(LongIndex delta)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
at(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
at(LongIndex idx) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
const_ref() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
ConstTensorRef typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
Coord_t typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
data() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
good() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Index typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
kRankcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >static
kStorageRankcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >static
leading_dim(int idx=0) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
LongIndex typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
map(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
MapFunc typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
offset(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator+(TensorCoord const &b) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator+=(TensorCoord const &b)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator-(TensorCoord const &b) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator-=(TensorCoord const &b)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator[](TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator[](LongIndex idx) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Rankcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >static
reset(Storage *ptr=nullptr)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
reset(Storage *ptr, StorageCoord const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Storage typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
StorageCoord typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
stride() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
stride(int dim) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
StrideVector typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
TensorCoord typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
TensorRef(Storage *ptr=nullptr)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorRef(Storage *ptr, Index ldm)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorRef(Storage *ptr, StrideVector const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorRef(Storage *ptr, StorageCoord const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorRef(TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const &ref)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
diff --git a/docs/classcutlass_1_1TensorRef.html b/docs/classcutlass_1_1TensorRef.html index 05a9b3dd52..1053ca0a91 100644 --- a/docs/classcutlass_1_1TensorRef.html +++ b/docs/classcutlass_1_1TensorRef.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::TensorRef< Storage_, Rank_ > Class Template Reference +Cutlass: cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Class Template Reference @@ -78,93 +78,278 @@ Static Public Attributes | List of all members
-
cutlass::TensorRef< Storage_, Rank_ > Class Template Reference
+
cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Class Template Reference
-

Structure modeling a pointer and stride into a tensor. -

-

#include <tensor_ref.h>

+
+Inheritance diagram for cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >:
+
+
+ + +cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > +cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > + +
- - - + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Types

typedef Storage_ Storage
 Data type of individual access. More...
 
typedef Storage_ Storage
 Data type of individual access. More...
 
typedef MapFunc_ MapFunc
 Mapping function from logical coordinate to internal n-D array. More...
 
typedef Index_ Index
 Index type. More...
 
typedef LongIndex_ LongIndex
 Typically, strides in memory can be very large. More...
 
typedef Coord< kRankTensorCoord
 Coordinate in logical tensor space. More...
 
typedef Coord< kStorageRankStorageCoord
 Coordinate in storage n-D array. More...
 
typedef Coord< kStorageRank - 1 > StrideVector
 
typedef TensorRef< typename platform::remove_const< Storage >::type const, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > ConstTensorRef
 Tensor reference to a constant value. More...
 
typedef TensorCoord Coord_t
 Coordinate in logical tensor space. More...
 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Member Functions

CUTLASS_HOST_DEVICE TensorRef ()
 Default ctor. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, Coord< Rank > stride)
 Constructs from a pointer, size, and stride. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr=nullptr, Coord< Rank > stride=Coord< Rank >(0))
 Updates the pointer, stride, and location within a TensorRef. More...
 
template<typename T >
TensorRef< T, Rankconvert ()
 Conversion function. More...
 
CUTLASS_HOST_DEVICE bool good () const
 Returns true if the TensorRef may be safely accessed. More...
 
CUTLASS_HOST_DEVICE Storagedata () const
 Returns the pointer to referenced data. More...
 
CUTLASS_HOST_DEVICE Coord< Rank > const & stride () const
 Returns the stride of the tensor. More...
 
CUTLASS_HOST_DEVICE int const & stride (int dim) const
 Returns the stride of the tensor in the given dimension. More...
 
CUTLASS_HOST_DEVICE int leading_dim () const
 Returns the maximum stride element as the 'leading dimension'. More...
 
CUTLASS_HOST_DEVICE long long offset (Coord< Rank > const &coord) const
 Computes the offset of an index from the origin of the tensor. More...
 
CUTLASS_HOST_DEVICE Storageat (Coord< Rank > const &coord) const
 Returns a reference to the element at a given Coord. More...
 
Storageoperator[] (Coord< Rank > const &coord) const
 Element-wise accessor. More...
 
CUTLASS_HOST_DEVICE Storageat (int idx) const
 Returns a reference to the element at a given Coord. More...
 
Storageoperator[] (int idx) const
 Element-wise accessor. More...
 
CUTLASS_HOST_DEVICE TensorRefadvance (Coord< Rank > const &b)
 Adds an offset to the pointer. More...
 
CUTLASS_HOST_DEVICE TensorRef operator+ (Coord< Rank > const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRef operator- (Coord< Rank > const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr=nullptr)
 Helper for 1-D memory. All higher ranks are projected onto the fastest changing rank. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, Index ldm)
 Helper to construct from a pointer and single stride element for 2-D pitch linear memory. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, StrideVector const &stride)
 Constructs from a single pointer and stride vector. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, StorageCoord const &stride)
 
CUTLASS_HOST_DEVICE TensorRef (TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const &ref)
 Enables conversion from TensorRef of non-const type. More...
 
CUTLASS_HOST_DEVICE ConstTensorRef const_ref () const
 Returns a reference to constant-valued tensor. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr=nullptr)
 Updates only the pointer. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr, StorageCoord const &stride)
 Updates the pointer, stride, and location within a TensorRef. More...
 
CUTLASS_HOST_DEVICE bool good () const
 Returns true if the TensorRef may be safely accessed. More...
 
CUTLASS_HOST_DEVICE Storagedata () const
 Returns the pointer to referenced data. More...
 
CUTLASS_HOST_DEVICE StorageCoord stride () const
 Returns the stride of the tensor. More...
 
CUTLASS_HOST_DEVICE Index stride (int dim) const
 Returns the stride of the tensor in the given dimension. More...
 
CUTLASS_HOST_DEVICE Index leading_dim (int idx=0) const
 Returns the maximum stride element as the 'leading dimension'. More...
 
CUTLASS_HOST_DEVICE StorageCoord map (TensorCoord const &coord) const
 Maps a logical coordinate to an n-D array in memory. More...
 
CUTLASS_HOST_DEVICE LongIndex offset (TensorCoord const &coord) const
 Computes the offset of an index from the origin of the tensor. More...
 
CUTLASS_HOST_DEVICE Storageat (TensorCoord const &coord) const
 Returns a reference to the element at a given Coord. More...
 
CUTLASS_HOST_DEVICE Storageat (LongIndex idx) const
 Returns a reference to the element at a given linear index. More...
 
CUTLASS_HOST_DEVICE Storageoperator[] (TensorCoord const &coord) const
 Returns a reference to the element at a given Coord. More...
 
CUTLASS_HOST_DEVICE Storageoperator[] (LongIndex idx) const
 Returns a reference to the element at a given linear index. More...
 
CUTLASS_HOST_DEVICE TensorRefadd_pointer_offset (LongIndex delta)
 Adds an offset to each pointer. More...
 
CUTLASS_HOST_DEVICE TensorRef operator+ (TensorCoord const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRefoperator+= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRef operator- (TensorCoord const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRefoperator-= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
- - - + + + + + + + + +

Static Public Attributes

static int const Rank = Rank_
 Rank of tensor. More...
 
static int const kRank = Rank_
 Logical rank of tensor index space. More...
 
static int const kStorageRank = StorageRank_
 Rank of internal storage. More...
 
static int const Rank = kRank
 Logical rank of tensor index space. More...
 

Member Typedef Documentation

- -

◆ Storage

+ +

◆ ConstTensorRef

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef TensorRef< typename platform::remove_const<Storage>::type const, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_> cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstTensorRef
+
+ +
+
+ +

◆ Coord_t

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef TensorCoord cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Coord_t
+
+

Require at least rank=1. Mathematically, a rank=0 tensor would be considered to be a scalar, but degenerate cases such as these are difficult to accommodate without extensive C++ metaprogramming or support for zero-length arrays.

+ +
+
+ +

◆ Index

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Index_ cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Index
+
+ +
+
+ +

◆ LongIndex

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef LongIndex_ cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::LongIndex
+
+ +
+
+ +

◆ MapFunc

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef MapFunc_ cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::MapFunc
+
+ +
+
+ +

◆ Storage

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Storage_ cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Storage
+
+ +
+
+ +

◆ StorageCoord

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Coord<kStorageRank> cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::StorageCoord
+
+ +
+
+ +

◆ StrideVector

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- + + +
typedef Storage_ cutlass::TensorRef< Storage_, Rank_ >::Storagetypedef Coord<kStorageRank - 1> cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::StrideVector
+
+

Stride vector in storage coordinate space - assumes least significant stride is 1 and does not store it.

+ +
+
+ +

◆ TensorCoord

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + +
typedef Coord<kRank> cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorCoord
@@ -172,21 +357,22 @@

Constructor & Destructor Documentation

- -

◆ TensorRef() [1/2]

+ +

◆ TensorRef() [1/5]

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- +
- + - + +
CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_ >::TensorRef CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRef ()Storageptr = nullptr)
@@ -199,27 +385,103 @@

-

◆ TensorRef() [2/2]

+ +

◆ TensorRef() [2/5]

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + +
- + - + - + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_ >::TensorRef CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRef (StorageStorage ptr,
Coord< RankIndex ldm 
)
+
+inline
+
+ +
+ + +

◆ TensorRef() [3/5]

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRef (Storageptr,
StrideVector const & stride 
)
+
+inline
+
+ +
+
+ +

◆ TensorRef() [4/5]

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + - - - - - - + + + + +
+ + + + + + + + + + + @@ -234,26 +496,55 @@

+

Constructs from a pointer and a stride vector of size kRank. If fastest changing stride is not 1, construction fails and subsequent calls to good() will return false.

+ + + + +

◆ TensorRef() [5/5]

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+

CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRef (Storageptr,
StorageCoord const &  stride 
+ + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRef (TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const & ref)
+
+inline
+

Member Function Documentation

- -

◆ advance()

+ +

◆ add_pointer_offset()

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- + - - + +
CUTLASS_HOST_DEVICE TensorRef& cutlass::TensorRef< Storage_, Rank_ >::advance CUTLASS_HOST_DEVICE TensorRef& cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::add_pointer_offset (Coord< Rank > const & b)LongIndex delta)
@@ -266,21 +557,21 @@

-

◆ at() [1/2]

+ +

◆ at() [1/2]

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- + - + @@ -294,21 +585,21 @@

-

◆ at() [2/2]

+ +

◆ at() [2/2]

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_ >::at CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::at (Coord< Rank > const & TensorCoord const &  coord) const
- + - + @@ -322,24 +613,22 @@

-

◆ convert()

+ +

◆ const_ref()

-template<typename Storage_, int Rank_>
-
-template<typename T >
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_ >::at CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::at (int LongIndex  idx) const
@@ -351,19 +640,19 @@

-

◆ data()

+ +

◆ data()

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - +
TensorRef<T, Rank> cutlass::TensorRef< Storage_, Rank_ >::convert CUTLASS_HOST_DEVICE ConstTensorRef cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::const_ref ( ) const
- + + +
- + @@ -378,19 +667,19 @@

-

◆ good()

+ +

◆ good()

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE Storage* cutlass::TensorRef< Storage_, Rank_ >::data CUTLASS_HOST_DEVICE Storage* cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::data ( ) const
- +
- + @@ -405,21 +694,22 @@

-

◆ leading_dim()

+ +

◆ leading_dim()

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE bool cutlass::TensorRef< Storage_, Rank_ >::good CUTLASS_HOST_DEVICE bool cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::good ( ) const
- +
- + - + +
CUTLASS_HOST_DEVICE int cutlass::TensorRef< Storage_, Rank_ >::leading_dim CUTLASS_HOST_DEVICE Index cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::leading_dim ()int idx = 0) const
@@ -432,21 +722,21 @@

-

◆ offset()

+ +

◆ map()

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- +
- + - + @@ -460,21 +750,49 @@

-

◆ operator+()

+ +

◆ offset()

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE long long cutlass::TensorRef< Storage_, Rank_ >::offset CUTLASS_HOST_DEVICE StorageCoord cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::map (Coord< Rank > const & TensorCoord const &  coord) const
+ + +
- + - + + + + +
CUTLASS_HOST_DEVICE TensorRef cutlass::TensorRef< Storage_, Rank_ >::operator+ CUTLASS_HOST_DEVICE LongIndex cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::offset (Coord< Rank > const & TensorCoord const & coord) const
+
+inline
+
+ +
+ + +

◆ operator+()

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + +
+ + + + + @@ -488,21 +806,49 @@

-

◆ operator-()

+ +

◆ operator+=()

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+

CUTLASS_HOST_DEVICE TensorRef cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator+ (TensorCoord const &  b) const
+ + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TensorRef& cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator+= (TensorCoord const & b)
+
+inline
+
+ +
+ + +

◆ operator-()

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- +
- + - + @@ -516,21 +862,49 @@

-

◆ operator[]() [1/2]

+ +

◆ operator-=()

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+

CUTLASS_HOST_DEVICE TensorRef cutlass::TensorRef< Storage_, Rank_ >::operator- CUTLASS_HOST_DEVICE TensorRef cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator- (Coord< Rank > const & TensorCoord const &  b) const
+ + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TensorRef& cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator-= (TensorCoord const & b)
+
+inline
+
+ +
+ + +

◆ operator[]() [1/2]

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- +
- + - + @@ -544,21 +918,21 @@

-

◆ operator[]() [2/2]

+ +

◆ operator[]() [2/2]

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

Storage& cutlass::TensorRef< Storage_, Rank_ >::operator[] CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator[] (Coord< Rank > const & TensorCoord const &  coord) const
- + - + - + @@ -122,25 +122,28 @@ - - - - - - - - - + + + + + + + + + + + + - - - - + + + +
- + - + @@ -572,28 +946,56 @@

-

◆ reset()

+ +

◆ reset() [1/2]

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

Storage& cutlass::TensorRef< Storage_, Rank_ >::operator[] CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator[] (int LongIndex  idx) const
+ + +
- + - - + + + + +
CUTLASS_HOST_DEVICE void cutlass::TensorRef< Storage_, Rank_ >::reset CUTLASS_HOST_DEVICE void cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::reset (Storageptr = nullptr, Storageptr = nullptr)
+
+inline
+
+ +
+ + +

◆ reset() [2/2]

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + - + +
+ + + + + + - - + + @@ -610,19 +1012,19 @@

-

◆ stride() [1/2]

+ +

◆ stride() [1/2]

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE void cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::reset (Storageptr,
Coord< Rankstride = Coord<Rank>(0) StorageCoord const & stride 
+ + +
- + @@ -637,19 +1039,19 @@

-

◆ stride() [2/2]

+ +

◆ stride() [2/2]

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE Coord<Rank> const& cutlass::TensorRef< Storage_, Rank_ >::stride CUTLASS_HOST_DEVICE StorageCoord cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::stride ( ) const
@@ -511,8 +551,8 @@

-

◆ inc_advance()

+ +

◆ inc_c()

@@ -523,7 +563,7 @@

- + @@ -666,19 +1068,67 @@

Member Data Documentation

- -

◆ Rank

+ +

◆ kRank

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+
CUTLASS_HOST_DEVICE int const& cutlass::TensorRef< Storage_, Rank_ >::stride CUTLASS_HOST_DEVICE Index cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::stride ( int  dim)
+ + + + +
+ + + + +
int const cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::kRank = Rank_
+
+static
+
+ +
+ + +

◆ kStorageRank

+ +
+
+
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + +
int const cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::kStorageRank = StorageRank_
+
+static
+
+ +
+
+ +

◆ Rank

-template<typename Storage_, int Rank_>
+template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
@@ -696,7 +1146,7 @@

diff --git a/docs/classcutlass_1_1TensorRef.png b/docs/classcutlass_1_1TensorRef.png new file mode 100644 index 0000000000..f8caaa61d0 Binary files /dev/null and b/docs/classcutlass_1_1TensorRef.png differ diff --git a/docs/classcutlass_1_1TensorRefArray_1_1ConstIterator-members.html b/docs/classcutlass_1_1TensorRefArray_1_1ConstIterator-members.html new file mode 100644 index 0000000000..44c118956d --- /dev/null +++ b/docs/classcutlass_1_1TensorRefArray_1_1ConstIterator-members.html @@ -0,0 +1,101 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+

- +
int const cutlass::TensorRef< Storage_, Rank_ >::Rank = Rank_int const cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Rank = kRank
+ + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + + +
+
+
cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator Member List
+
+
+ +

This is the complete list of members for cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator, including all inherited members.

+ + + + + + + + + + + + +
ConstIterator(TensorArrayRef const &ref, int idx=0)cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator() constcutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator+(Index idx)cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator++()cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator++(int)cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator+=(Index idx)cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator-(Index idx)cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator--()cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator--(int)cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator-=(Index idx)cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
TensorRef typedefcutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator
+ + + + diff --git a/docs/classcutlass_1_1TensorRefArray_1_1ConstIterator.html b/docs/classcutlass_1_1TensorRefArray_1_1ConstIterator.html new file mode 100644 index 0000000000..aa40085cb9 --- /dev/null +++ b/docs/classcutlass_1_1TensorRefArray_1_1ConstIterator.html @@ -0,0 +1,440 @@ + + + + + + + +Cutlass: cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator Class Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator Class Reference
+
+
+ +

TensorRefIterator over TensorRef objects in TensorRefArray. +

+ +

#include <tensor_ref_collection.h>

+ + + + + +

+Public Types

typedef Base TensorRef
 TensorRef returned by the iterator. More...
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE ConstIterator (TensorArrayRef const &ref, int idx=0)
 Constructs a ConstIterator over the TensorRef objects. More...
 
CUTLASS_HOST_DEVICE TensorRefoperator () const
 Obtains a TensorRef pointed to by this iterator. More...
 
CUTLASS_HOST_DEVICE ConstIteratoroperator++ ()
 Advances to next TensorRef. More...
 
CUTLASS_HOST_DEVICE ConstIterator operator++ (int)
 Advances to next TensorRef. More...
 
CUTLASS_HOST_DEVICE ConstIterator operator+ (Index idx)
 
CUTLASS_HOST_DEVICE ConstIteratoroperator+= (Index idx)
 
CUTLASS_HOST_DEVICE ConstIteratoroperator-- ()
 
CUTLASS_HOST_DEVICE ConstIterator operator-- (int)
 Moves to previous TensorRef. More...
 
CUTLASS_HOST_DEVICE ConstIteratoroperator-= (Index idx)
 
CUTLASS_HOST_DEVICE ConstIterator operator- (Index idx)
 
+

Member Typedef Documentation

+ +

◆ TensorRef

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Base cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::TensorRef
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ ConstIterator()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::ConstIterator (TensorArrayRef const & ref,
int idx = 0 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ operator()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE TensorRef* cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator () const
+
+inline
+
+ +
+
+ +

◆ operator+()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator+ (Index idx)
+
+inline
+
+ +
+
+ +

◆ operator++() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator& cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator++ ()
+
+inline
+
+ +
+
+ +

◆ operator++() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator++ (int )
+
+inline
+
+ +
+
+ +

◆ operator+=()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator& cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator+= (Index idx)
+
+inline
+
+ +
+
+ +

◆ operator-()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator- (Index idx)
+
+inline
+
+ +
+
+ +

◆ operator--() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator& cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator-- ()
+
+inline
+
+ +
+
+ +

◆ operator--() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator-- (int )
+
+inline
+
+ +
+
+ +

◆ operator-=()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator& cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator-= (Index idx)
+
+inline
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/docs/classcutlass_1_1TensorRefBatchStrided_1_1ConstIterator-members.html b/docs/classcutlass_1_1TensorRefBatchStrided_1_1ConstIterator-members.html new file mode 100644 index 0000000000..bb3876187e --- /dev/null +++ b/docs/classcutlass_1_1TensorRefBatchStrided_1_1ConstIterator-members.html @@ -0,0 +1,102 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator Member List
+
+
+ +

This is the complete list of members for cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator, including all inherited members.

+ + + + + + + + + + + + + +
ConstIterator(TensorRefBatchStrided const &ref, LongIndex offset=0)cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator() constcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator+(Index idx)cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator++()cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator++(int)cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator+=(Index idx)cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator-(Index idx)cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator-(ConstIterator const &it)cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator--()cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator--(int)cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
operator-=(Index idx)cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIteratorinline
TensorRef typedefcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator
+ + + + diff --git a/docs/classcutlass_1_1TensorRefBatchStrided_1_1ConstIterator.html b/docs/classcutlass_1_1TensorRefBatchStrided_1_1ConstIterator.html new file mode 100644 index 0000000000..c3dbd9dfc8 --- /dev/null +++ b/docs/classcutlass_1_1TensorRefBatchStrided_1_1ConstIterator.html @@ -0,0 +1,476 @@ + + + + + + + +Cutlass: cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator Class Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator Class Reference
+
+
+ +

Constant iterator over tensors implied by TensorRefBatchStrided. +

+ +

#include <tensor_ref_collection.h>

+ + + + + +

+Public Types

typedef Base TensorRef
 TensorRef returned by the iterator. More...
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE ConstIterator (TensorRefBatchStrided const &ref, LongIndex offset=0)
 Constructs a ConstIterator from a parent TensorRefBatchStrided. More...
 
CUTLASS_HOST_DEVICE TensorRef * operator () const
 Obtains a TensorRef pointed to by the iterator. More...
 
CUTLASS_HOST_DEVICE ConstIterator & operator++ ()
 Advances the iterator to point to the next tensor. More...
 
CUTLASS_HOST_DEVICE ConstIterator operator++ (int)
 Advances the iterator to point to the next tensor. More...
 
CUTLASS_HOST_DEVICE ConstIterator operator+ (Index idx)
 Returns an iterator advanced by (idx) amount. More...
 
CUTLASS_HOST_DEVICE ConstIterator & operator+= (Index idx)
 Advances this iterator by (idx) and returns a reference to self. More...
 
CUTLASS_HOST_DEVICE ConstIterator & operator-- ()
 Moves to the previous tensor. More...
 
CUTLASS_HOST_DEVICE ConstIterator operator-- (int)
 Moves to the previous tensor. More...
 
CUTLASS_HOST_DEVICE ConstIterator operator- (Index idx)
 Returns an iterator moved backward by (idx) amount. More...
 
CUTLASS_HOST_DEVICE ConstIterator & operator-= (Index idx)
 Moves this iterator by (idx) and returns a reference to self. More...
 
CUTLASS_HOST_DEVICE Stride operator- (ConstIterator const &it)
 Returns the difference in offset between two iterators. More...
 
+

Member Typedef Documentation

+ +

◆ TensorRef

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Base cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::TensorRef
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ ConstIterator()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::ConstIterator (TensorRefBatchStrided const & ref,
LongIndex offset = 0 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ operator()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE TensorRef* cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator () const
+
+inline
+
+ +
+
+ +

◆ operator+()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator+ (Index idx)
+
+inline
+
+ +
+
+ +

◆ operator++() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator& cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator++ ()
+
+inline
+
+ +
+
+ +

◆ operator++() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator++ (int )
+
+inline
+
+ +
+
+ +

◆ operator+=()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator& cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator+= (Index idx)
+
+inline
+
+ +
+
+ +

◆ operator-() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator- (Index idx)
+
+inline
+
+ +
+
+ +

◆ operator-() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE Stride cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator- (ConstIterator const & it)
+
+inline
+
+ +
+
+ +

◆ operator--() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator& cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator-- ()
+
+inline
+
+ +
+
+ +

◆ operator--() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator-- (int )
+
+inline
+
+ +
+
+ +

◆ operator-=()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator& cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstIterator::operator-= (Index idx)
+
+inline
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/docs/classcutlass_1_1TensorRef_3_01Storage___00_01Rank___00_01MapFunc___00_011_00_01Index___00_01LongIndex___01_4-members.html b/docs/classcutlass_1_1TensorRef_3_01Storage___00_01Rank___00_01MapFunc___00_011_00_01Index___00_01LongIndex___01_4-members.html new file mode 100644 index 0000000000..8af74ab9ba --- /dev/null +++ b/docs/classcutlass_1_1TensorRef_3_01Storage___00_01Rank___00_01MapFunc___00_011_00_01Index___00_01LongIndex___01_4-members.html @@ -0,0 +1,124 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ > Member List
+
+
+ +

This is the complete list of members for cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
add_pointer_offset(LongIndex delta)cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
at(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
at(LongIndex idx) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
const_ref() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
ConstTensorRef typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >
Coord_t typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >
data() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
good() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
Index typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >
kRankcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >static
kStorageRankcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >static
leading_dim(int idx=0) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
LongIndex typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >
map(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
MapFunc typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >
offset(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
operator+(TensorCoord const &b) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
operator+=(TensorCoord const &b)cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
operator-(TensorCoord const &b) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
operator-=(TensorCoord const &b)cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
operator[](TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
operator[](LongIndex idx) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
Rankcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >static
reset(Storage *ptr=nullptr)cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
reset(Storage *ptr, StorageCoord const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
Storage typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >
StorageCoord typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >
stride() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
stride(int dim) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
TensorCoord typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >
TensorRef(Storage *ptr=nullptr)cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
TensorRef(Storage *ptr, StrideVector const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
TensorRef(Storage *ptr, StorageCoord const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
TensorRef(TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const &ref)cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >inline
+ + + + diff --git a/docs/classcutlass_1_1TensorRef_3_01Storage___00_01Rank___00_01MapFunc___00_011_00_01Index___00_01LongIndex___01_4.html b/docs/classcutlass_1_1TensorRef_3_01Storage___00_01Rank___00_01MapFunc___00_011_00_01Index___00_01LongIndex___01_4.html new file mode 100644 index 0000000000..2dfd10c99a --- /dev/null +++ b/docs/classcutlass_1_1TensorRef_3_01Storage___00_01Rank___00_01MapFunc___00_011_00_01Index___00_01LongIndex___01_4.html @@ -0,0 +1,1092 @@ + + + + + + + +Cutlass: cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ > Class Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ > Class Template Reference
+
+
+ +

Specialization for rank=1 case with no internal StrideVector. +

+ +

#include <tensor_ref.h>

+ + + + +

+Classes

struct  StrideVector
 
+ + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Types

typedef Storage_ Storage
 Data type of individual access. More...
 
typedef MapFunc_ MapFunc
 Mapping function from logical coordinate to internal n-D array. More...
 
typedef Index_ Index
 Index type. More...
 
typedef LongIndex_ LongIndex
 Typically, strides in memory can be very large. More...
 
typedef Coord< kRank > TensorCoord
 Coordinate in logical tensor space. More...
 
typedef Coord< kStorageRank > StorageCoord
 Coordinate in storage n-D array. More...
 
typedef TensorRef< typename platform::remove_const< Storage >::type const, Rank_, MapFunc_, kStorageRank, Index_, LongIndex_ > ConstTensorRef
 Tensor reference to a constant value. More...
 
typedef TensorCoord Coord_t
 Coordinate in logical tensor space. More...
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE TensorRef (Storage *ptr=nullptr)
 Helper for 1-D memory. All higher ranks are projected onto the fastest changing rank. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, StrideVector const &stride)
 Constructs from a single pointer and stride vector. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, StorageCoord const &stride)
 
CUTLASS_HOST_DEVICE TensorRef (TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const &ref)
 Enables conversion from TensorRef of non-const type. More...
 
CUTLASS_HOST_DEVICE ConstTensorRef const_ref () const
 Returns a reference to constant-valued tensor. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr=nullptr)
 Updates only the pointer. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr, StorageCoord const &stride)
 Updates the pointer, stride, and location within a TensorRef. More...
 
CUTLASS_HOST_DEVICE bool good () const
 Returns true if the TensorRef may be safely accessed. More...
 
CUTLASS_HOST_DEVICE Storage * data () const
 Returns the pointer to referenced data. More...
 
CUTLASS_HOST_DEVICE StorageCoord stride () const
 Returns the stride of the tensor. More...
 
CUTLASS_HOST_DEVICE Index stride (int dim) const
 Returns the stride of the tensor in the given dimension. More...
 
CUTLASS_HOST_DEVICE Index leading_dim (int idx=0) const
 Returns the maximum stride element as the 'leading dimension'. More...
 
CUTLASS_HOST_DEVICE StorageCoord map (TensorCoord const &coord) const
 Maps a logical coordinate to an n-D array in memory. More...
 
CUTLASS_HOST_DEVICE LongIndex offset (TensorCoord const &coord) const
 Computes the offset of an index from the origin of the tensor. More...
 
CUTLASS_HOST_DEVICE Storage & at (TensorCoord const &coord) const
 Returns a reference to the element at a given Coord. More...
 
CUTLASS_HOST_DEVICE Storage & at (LongIndex idx) const
 Returns a reference to the element at a given linear index. More...
 
CUTLASS_HOST_DEVICE Storage & operator[] (TensorCoord const &coord) const
 Returns a reference to the element at a given Coord. More...
 
CUTLASS_HOST_DEVICE Storage & operator[] (LongIndex idx) const
 Returns a reference to the element at a given linear index. More...
 
CUTLASS_HOST_DEVICE TensorRef & add_pointer_offset (LongIndex delta)
 Adds an offset to each pointer. More...
 
CUTLASS_HOST_DEVICE TensorRef operator+ (TensorCoord const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRef & operator+= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRef operator- (TensorCoord const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRef & operator-= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
+ + + + + + + + + + +

+Static Public Attributes

static int const kRank = Rank_
 Logical rank of tensor index space. More...
 
static int const kStorageRank = 1
 Rank of internal storage. More...
 
static int const Rank = kRank
 Logical rank of tensor index space. More...
 
+

Member Typedef Documentation

+ +

◆ ConstTensorRef

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + +
typedef TensorRef< typename platform::remove_const<Storage>::type const, Rank_, MapFunc_, kStorageRank, Index_, LongIndex_> cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::ConstTensorRef
+
+ +
+
+ +

◆ Coord_t

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + +
typedef TensorCoord cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::Coord_t
+
+ +
+
+ +

◆ Index

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + +
typedef Index_ cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::Index
+
+ +
+
+ +

◆ LongIndex

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + +
typedef LongIndex_ cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::LongIndex
+
+ +
+
+ +

◆ MapFunc

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + +
typedef MapFunc_ cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::MapFunc
+
+ +
+
+ +

◆ Storage

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + +
typedef Storage_ cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::Storage
+
+ +
+
+ +

◆ StorageCoord

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + +
typedef Coord<kStorageRank> cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::StorageCoord
+
+ +
+
+ +

◆ TensorCoord

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + +
typedef Coord<kRank> cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::TensorCoord
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ TensorRef() [1/4]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::TensorRef (Storage * ptr = nullptr)
+
+inline
+
+ +
+
+ +

◆ TensorRef() [2/4]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::TensorRef (Storage * ptr,
StrideVector const & stride 
)
+
+inline
+
+ +
+
+ +

◆ TensorRef() [3/4]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::TensorRef (Storage * ptr,
StorageCoord const & stride 
)
+
+inline
+
+

Constructs from a pointer and a stride vector of size kRank. If fastest changing stride is not 1, construction fails and subsequent calls to good() will return false.

+ +
+
+ +

◆ TensorRef() [4/4]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::TensorRef (TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const & ref)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ add_pointer_offset()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TensorRef& cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::add_pointer_offset (LongIndex delta)
+
+inline
+
+ +
+
+ +

◆ at() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::at (TensorCoord const & coord) const
+
+inline
+
+ +
+
+ +

◆ at() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::at (LongIndex idx) const
+
+inline
+
+ +
+
+ +

◆ const_ref()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE ConstTensorRef cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::const_ref () const
+
+inline
+
+ +
+
+ +

◆ data()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Storage* cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::data () const
+
+inline
+
+ +
+
+ +

◆ good()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE bool cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::good () const
+
+inline
+
+ +
+
+ +

◆ leading_dim()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE Index cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::leading_dim (int idx = 0) const
+
+inline
+
+ +
+
+ +

◆ map()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE StorageCoord cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::map (TensorCoord const & coord) const
+
+inline
+
+ +
+
+ +

◆ offset()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE LongIndex cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::offset (TensorCoord const & coord) const
+
+inline
+
+ +
+
+ +

◆ operator+()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TensorRef cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::operator+ (TensorCoord const & b) const
+
+inline
+
+ +
+
+ +

◆ operator+=()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TensorRef& cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::operator+= (TensorCoord const & b)
+
+inline
+
+ +
+
+ +

◆ operator-()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TensorRef cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::operator- (TensorCoord const & b) const
+
+inline
+
+ +
+
+ +

◆ operator-=()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TensorRef& cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::operator-= (TensorCoord const & b)
+
+inline
+
+ +
+
+ +

◆ operator[]() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::operator[] (TensorCoord const & coord) const
+
+inline
+
+ +
+
+ +

◆ operator[]() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE Storage& cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::operator[] (LongIndex idx) const
+
+inline
+
+ +
+
+ +

◆ reset() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::reset (Storage * ptr = nullptr)
+
+inline
+
+ +
+
+ +

◆ reset() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::reset (Storage * ptr,
StorageCoord const & stride 
)
+
+inline
+
+ +
+
+ +

◆ stride() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE StorageCoord cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::stride () const
+
+inline
+
+ +
+
+ +

◆ stride() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE Index cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::stride (int dim) const
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ kRank

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + +
int const cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::kRank = Rank_
+
+static
+
+ +
+
+ +

◆ kStorageRank

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + +
int const cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::kStorageRank = 1
+
+static
+
+ +
+
+ +

◆ Rank

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ , typename Index_ , typename LongIndex_ >
+ + + + + +
+ + + + +
int const cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::Rank = kRank
+
+static
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/docs/classcutlass_1_1TensorView-members.html b/docs/classcutlass_1_1TensorView-members.html index e9401f9cc9..9f5c325352 100644 --- a/docs/classcutlass_1_1TensorView-members.html +++ b/docs/classcutlass_1_1TensorView-members.html @@ -73,51 +73,70 @@
-
cutlass::TensorView< T > Member List
+
cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Member List
-

This is the complete list of members for cutlass::TensorView< T >, including all inherited members.

+

This is the complete list of members for cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >, including all inherited members.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
advance(Coord< Rank > const &b)cutlass::TensorRef< T, 4 >inline
at(Coord_t const &coord) constcutlass::TensorView< T >inline
at(Offset_t idx) constcutlass::TensorView< T >inline
Base typedefcutlass::TensorView< T >
const_ref()cutlass::TensorView< T >inline
ConstTensorRef_t typedefcutlass::TensorView< T >
contains(Coord_t const &coord) constcutlass::TensorView< T >inline
convert()cutlass::TensorRef< T, 4 >inline
Coord_t typedefcutlass::TensorView< T >
data() constcutlass::TensorView< T >inline
good() constcutlass::TensorView< T >inline
leading_dim() constcutlass::TensorRef< T, 4 >inline
offset(Coord_t const &coord) constcutlass::TensorView< T >inline
Offset_t typedefcutlass::TensorView< T >
operator+(Coord< Rank > const &b) constcutlass::TensorRef< T, 4 >inline
operator-(Coord< Rank > const &b) constcutlass::TensorRef< T, 4 >inline
operator=(TensorView const &_tensor)cutlass::TensorView< T >inline
operator[](Coord< Rank > const &coord) constcutlass::TensorView< T >inline
TensorRef< T, 4 >::operator[](int idx) constcutlass::TensorRef< T, 4 >inline
Rankcutlass::TensorView< T >static
ref()cutlass::TensorView< T >inline
ref() constcutlass::TensorView< T >inline
reset(TensorRef_t const &_ref=TensorRef_t(0), Coord_t const &_size=Coord_t())cutlass::TensorView< T >inline
TensorRef< T, 4 >::reset(Storage *ptr=nullptr, Coord< Rank > stride=Coord< Rank >(0))cutlass::TensorRef< T, 4 >inline
size() constcutlass::TensorView< T >inline
size(int dim) constcutlass::TensorView< T >inline
Storage typedefcutlass::TensorRef< T, 4 >
stride() constcutlass::TensorView< T >inline
stride(int dim) constcutlass::TensorView< T >inline
subview(Coord_t const &location, Coord_t size) constcutlass::TensorView< T >inline
TensorRef()cutlass::TensorRef< T, 4 >inline
TensorRef(Storage *ptr, Coord< Rank > stride)cutlass::TensorRef< T, 4 >inline
TensorRef_t typedefcutlass::TensorView< T >
TensorView()cutlass::TensorView< T >inline
TensorView(TensorRef_t const &_ref, Coord_t const &_size)cutlass::TensorView< T >inline
add_pointer_offset(LongIndex delta)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
at(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
at(LongIndex idx) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Base typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
capacity() constcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
const_ref() constcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
ConstTensorRef typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
ConstTensorRef_t typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
ConstTensorView typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
contains(TensorCoord const &coord) constcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Coord_t typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
data() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
good() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Index typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
kRankcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >static
kStorageRankcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >static
leading_dim(int idx=0) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
LongIndex typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
map(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
MapFunc typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
offset(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Offset_t typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
operator+(TensorCoord const &b) constcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator+=(TensorCoord const &b)cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator-(TensorCoord const &b) constcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator-=(TensorCoord const &b)cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator=(TensorView const &_tensor)cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator[](TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator[](LongIndex idx) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Rankcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >static
ref() constcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
reset(Base const &_ref=Base(), TensorCoord const &_size=TensorCoord())cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::reset(Storage *ptr=nullptr)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::reset(Storage *ptr, StorageCoord const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
size() constcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
size(int dim) constcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Storage typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
StorageCoord typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
stride() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
stride(int dim) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
StrideVector typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
subview(TensorCoord const &location, TensorCoord size) constcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorCoord typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
TensorRef typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
cutlass::TensorRef::TensorRef(Storage *ptr=nullptr)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::TensorRef(Storage *ptr, Index ldm)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::TensorRef(Storage *ptr, StrideVector const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::TensorRef(Storage *ptr, StorageCoord const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::TensorRef(TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const &ref)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorRef_t typedefcutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
TensorView()cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorView(Base const &_ref, TensorCoord const &_size)cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorView(Storage *ptr, StrideVector const &stride, TensorCoord const &size)cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorView(Storage *ptr, StorageCoord const &stride, TensorCoord const &size)cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
diff --git a/docs/classcutlass_1_1TensorView.html b/docs/classcutlass_1_1TensorView.html index 7dba23228c..276d1077d5 100644 --- a/docs/classcutlass_1_1TensorView.html +++ b/docs/classcutlass_1_1TensorView.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::TensorView< T > Class Template Reference +Cutlass: cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Class Template Reference @@ -78,332 +78,461 @@ Static Public Attributes | List of all members
-
cutlass::TensorView< T > Class Template Reference
+
cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Class Template Reference
-

Host-side reference implementation of tensor operations. +

Defines a view into a logical tensor.

#include <tensor_view.h>

-Inheritance diagram for cutlass::TensorView< T >:
+Inheritance diagram for cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >:
- - -cutlass::TensorRef< T, 4 > + + +cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
- - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Types

typedef TensorRef< T, 4 > Base
 Reference and stride. More...
 
typedef Base TensorRef_t
 Reference and stride. More...
 
typedef TensorRef< T const, 4 > ConstTensorRef_t
 Reference to constant type. More...
 
typedef int Offset_t
 Type used to compute the offset of an element to the base of a tensor. More...
 
typedef Coord< RankCoord_t
 Coordinate into tensor. More...
 
- Public Types inherited from cutlass::TensorRef< T, 4 >
typedef T Storage
 Data type of individual access. More...
 
typedef TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Base
 Base tensor reference. More...
 
typedef TensorRef< typename platform::remove_const< Storage_ >::type const, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > ConstTensorRef
 Tensor reference to of constant value. More...
 
typedef Base TensorRef
 Base tensor reference. More...
 
typedef Base::Storage Storage
 Storage type. More...
 
typedef Base::Index Index
 Index type. More...
 
typedef TensorRef::TensorCoord TensorCoord
 Coordinate in logical tensor space. More...
 
typedef TensorRef::StorageCoord StorageCoord
 Coordinate in storage n-D array. More...
 
typedef TensorRef::StrideVector StrideVector
 
typedef TensorView< typename platform::remove_const< Storage >::type const, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > ConstTensorView
 TensorView of constant value. More...
 
typedef TensorCoord Coord_t
 Coordinate in logical tensor space. More...
 
typedef Base::LongIndex Offset_t
 Type used to compute the offset of an element to the base of a tensor. More...
 
typedef TensorRef TensorRef_t
 Base class. More...
 
typedef TensorRef::ConstTensorRef ConstTensorRef_t
 TensorRef to const-valued type. More...
 
- Public Types inherited from cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
typedef Storage_ Storage
 Data type of individual access. More...
 
typedef MapFunc_ MapFunc
 Mapping function from logical coordinate to internal n-D array. More...
 
typedef Index_ Index
 Index type. More...
 
typedef LongIndex_ LongIndex
 Typically, strides in memory can be very large. More...
 
typedef Coord< kRankTensorCoord
 Coordinate in logical tensor space. More...
 
typedef Coord< kStorageRankStorageCoord
 Coordinate in storage n-D array. More...
 
typedef Coord< kStorageRank - 1 > StrideVector
 
typedef TensorRef< typename platform::remove_const< Storage >::type const, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > ConstTensorRef
 Tensor reference to of constant value. More...
 
typedef TensorCoord Coord_t
 Coordinate in logical tensor space. More...
 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Member Functions

CUTLASS_HOST_DEVICE TensorView ()
 Default constructor. More...
 
CUTLASS_HOST_DEVICE TensorView (TensorRef_t const &_ref, Coord_t const &_size)
 Constructs a Tensor_view from a TensorRef and size. More...
 
CUTLASS_HOST_DEVICE bool good () const
 Returns true if the Tensor_view is bound to some memory. More...
 
CUTLASS_HOST_DEVICE T * data () const
 Returns a pointer to data. More...
 
CUTLASS_HOST_DEVICE void reset (TensorRef_t const &_ref=TensorRef_t(0), Coord_t const &_size=Coord_t())
 Updates the reference and size of a Tensor_view object. More...
 
CUTLASS_HOST_DEVICE TensorRef_tref ()
 Accesses the tensor reference pointing to data. More...
 
CUTLASS_HOST_DEVICE ConstTensorRef_t const_ref ()
 
CUTLASS_HOST_DEVICE TensorRef_t const & ref () const
 Accesses the tensor reference pointing to data. More...
 
CUTLASS_HOST_DEVICE Coord_t const & size () const
 Accesses the size. More...
 
CUTLASS_HOST_DEVICE int size (int dim) const
 Accesses the size. More...
 
CUTLASS_HOST_DEVICE Coord_t const & stride () const
 Accesses the stride. More...
 
CUTLASS_HOST_DEVICE int const & stride (int dim) const
 Accesses the stride. More...
 
CUTLASS_HOST_DEVICE TensorViewoperator= (TensorView const &_tensor)
 Assigns the Tensor_view. More...
 
CUTLASS_HOST_DEVICE Offset_t offset (Coord_t const &coord) const
 Returns the index of an element. More...
 
CUTLASS_HOST_DEVICE bool contains (Coord_t const &coord) const
 Determines whether a location is within a tensor. More...
 
CUTLASS_HOST_DEVICE T & at (Coord_t const &coord) const
 Element-wise accessor. More...
 
T & operator[] (Coord< Rank > const &coord) const
 Element-wise accessor. More...
 
CUTLASS_HOST_DEVICE T & at (Offset_t idx) const
 Element-wise accessor. More...
 
CUTLASS_HOST_DEVICE TensorView< T > subview (Coord_t const &location, Coord_t size) const
 Returns a Tensor_view given location and size quantities. More...
 
- Public Member Functions inherited from cutlass::TensorRef< T, 4 >
CUTLASS_HOST_DEVICE TensorRef ()
 Default ctor. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, Coord< Rank > stride)
 Constructs from a pointer, size, and stride. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr=nullptr, Coord< Rank > stride=Coord< Rank >(0))
 Updates the pointer, stride, and location within a TensorRef. More...
 
TensorRef< T, Rankconvert ()
 Conversion function. More...
 
CUTLASS_HOST_DEVICE bool good () const
 Returns true if the TensorRef may be safely accessed. More...
 
CUTLASS_HOST_DEVICE Storagedata () const
 Returns the pointer to referenced data. More...
 
CUTLASS_HOST_DEVICE Coord< Rank > const & stride () const
 Returns the stride of the tensor. More...
 
CUTLASS_HOST_DEVICE int const & stride (int dim) const
 Returns the stride of the tensor in the given dimension. More...
 
CUTLASS_HOST_DEVICE int leading_dim () const
 Returns the maximum stride element as the 'leading dimension'. More...
 
CUTLASS_HOST_DEVICE long long offset (Coord< Rank > const &coord) const
 Computes the offset of an index from the origin of the tensor. More...
 
CUTLASS_HOST_DEVICE Storageat (Coord< Rank > const &coord) const
 Returns a reference to the element at a given Coord. More...
 
CUTLASS_HOST_DEVICE Storageat (int idx) const
 Returns a reference to the element at a given Coord. More...
 
Storageoperator[] (Coord< Rank > const &coord) const
 Element-wise accessor. More...
 
Storageoperator[] (int idx) const
 Element-wise accessor. More...
 
CUTLASS_HOST_DEVICE TensorRefadvance (Coord< Rank > const &b)
 Adds an offset to the pointer. More...
 
CUTLASS_HOST_DEVICE TensorRef operator+ (Coord< Rank > const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRef operator- (Coord< Rank > const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorView ()
 Default constructor. More...
 
CUTLASS_HOST_DEVICE TensorView (Base const &_ref, TensorCoord const &_size)
 Constructs a TensorView from a TensorRef and size. More...
 
CUTLASS_HOST_DEVICE TensorView (Storage *ptr, StrideVector const &stride, TensorCoord const &size)
 Constructs a TensorView from a pointer, a stride vector, and size. More...
 
CUTLASS_HOST_DEVICE TensorView (Storage *ptr, StorageCoord const &stride, TensorCoord const &size)
 Constructs a TensorView from a pointer, a stride vector, and size. More...
 
CUTLASS_HOST_DEVICE void reset (Base const &_ref=Base(), TensorCoord const &_size=TensorCoord())
 Updates the reference and size of a Tensor_view object. More...
 
CUTLASS_HOST_DEVICE TensorCoord const & size () const
 Accesses the size. More...
 
CUTLASS_HOST_DEVICE Index size (int dim) const
 Accesses the size. More...
 
CUTLASS_HOST_DEVICE TensorViewoperator= (TensorView const &_tensor)
 Assigns the Tensor_view. More...
 
CUTLASS_HOST_DEVICE bool contains (TensorCoord const &coord) const
 Determines whether a location is within a tensor. More...
 
CUTLASS_HOST_DEVICE TensorRef ref () const
 Returns a TensorRef pointing to the first element of the tensor. More...
 
CUTLASS_HOST_DEVICE ConstTensorRef const_ref () const
 Returns a TensorRef pointing to the first element of the tensor. More...
 
CUTLASS_HOST_DEVICE TensorView subview (TensorCoord const &location, TensorCoord size) const
 Returns a Tensor_view given location and size quantities. More...
 
CUTLASS_HOST_DEVICE size_t capacity () const
 Returns the number of scalar elements needed to store tensor. More...
 
CUTLASS_HOST_DEVICE TensorView operator+ (TensorCoord const &b) const
 Returns a TensorView offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorViewoperator+= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorView operator- (TensorCoord const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorViewoperator-= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
- Public Member Functions inherited from cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr=nullptr)
 Helper for 1-D memory. All higher ranks are projected onto the fastest changing rank. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, Index ldm)
 Helper to construct from a pointer and single stride element for 2-D pitch linear memory. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, StrideVector const &stride)
 Constructs from a single pointer and stride vector. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, StorageCoord const &stride)
 
CUTLASS_HOST_DEVICE TensorRef (TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const &ref)
 Enables conversion from TensorRef of non-const type. More...
 
CUTLASS_HOST_DEVICE ConstTensorRef const_ref () const
 Returns a reference to constant-valued tensor. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr=nullptr)
 Updates only the pointer. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr, StorageCoord const &stride)
 Updates the pointer, stride, and location within a TensorRef. More...
 
CUTLASS_HOST_DEVICE bool good () const
 Returns true if the TensorRef may be safely accessed. More...
 
CUTLASS_HOST_DEVICE Storagedata () const
 Returns the pointer to referenced data. More...
 
CUTLASS_HOST_DEVICE StorageCoord stride () const
 Returns the stride of the tensor. More...
 
CUTLASS_HOST_DEVICE Index stride (int dim) const
 Returns the stride of the tensor in the given dimension. More...
 
CUTLASS_HOST_DEVICE Index leading_dim (int idx=0) const
 Returns the maximum stride element as the 'leading dimension'. More...
 
CUTLASS_HOST_DEVICE StorageCoord map (TensorCoord const &coord) const
 Maps a logical coordinate to an n-D array in memory. More...
 
CUTLASS_HOST_DEVICE LongIndex offset (TensorCoord const &coord) const
 Computes the offset of an index from the origin of the tensor. More...
 
CUTLASS_HOST_DEVICE Storageat (TensorCoord const &coord) const
 Returns a reference to the element at a given Coord. More...
 
CUTLASS_HOST_DEVICE Storageat (LongIndex idx) const
 Returns a reference to the element at a given linear index. More...
 
CUTLASS_HOST_DEVICE Storageoperator[] (TensorCoord const &coord) const
 Returns a reference to the element at a given Coord. More...
 
CUTLASS_HOST_DEVICE Storageoperator[] (LongIndex idx) const
 Returns a reference to the element at a given linear index. More...
 
CUTLASS_HOST_DEVICE TensorRefadd_pointer_offset (LongIndex delta)
 Adds an offset to each pointer. More...
 
CUTLASS_HOST_DEVICE TensorRef operator+ (TensorCoord const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRefoperator+= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRef operator- (TensorCoord const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRefoperator-= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
- - - - - - - + + + + + + + + + + + + +

Static Public Attributes

static int const Rank = TensorRef_t::Rank
 Rank of tensor. More...
 
- Static Public Attributes inherited from cutlass::TensorRef< T, 4 >
static int const Rank
 Rank of tensor. More...
 
static int const Rank = Base::kRank
 Logical rank of tensor index space. More...
 
- Static Public Attributes inherited from cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
static int const kRank = Rank_
 Logical rank of tensor index space. More...
 
static int const kStorageRank = StorageRank_
 Rank of internal storage. More...
 
static int const Rank = kRank
 Logical rank of tensor index space. More...
 

Member Typedef Documentation

- -

◆ Base

+ +

◆ Base

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- +
typedef TensorRef<T, 4> cutlass::TensorView< T >::Basetypedef TensorRef<Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_> cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Base
- -

◆ ConstTensorRef_t

+ +

◆ ConstTensorRef

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- +
typedef TensorRef<T const, 4> cutlass::TensorView< T >::ConstTensorRef_ttypedef TensorRef< typename platform::remove_const<Storage_>::type const, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_> cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstTensorRef
- -

◆ Coord_t

+ +

◆ ConstTensorRef_t

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- +
typedef Coord<Rank> cutlass::TensorView< T >::Coord_ttypedef TensorRef::ConstTensorRef cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstTensorRef_t
- -

◆ Offset_t

+ +

◆ ConstTensorView

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- +
typedef int cutlass::TensorView< T >::Offset_ttypedef TensorView< typename platform::remove_const<Storage>::type const, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_> cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ConstTensorView
- -

◆ TensorRef_t

+ +

◆ Coord_t

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
- +
typedef Base cutlass::TensorView< T >::TensorRef_ttypedef TensorCoord cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Coord_t
-

Constructor & Destructor Documentation

- -

◆ TensorView() [1/2]

+ +

◆ Index

-template<typename T>
- - - - - -
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long> - - - - +
CUTLASS_HOST_DEVICE cutlass::TensorView< T >::TensorView ()typedef Base::Index cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Index
-
-inline
- -

◆ TensorView() [2/2]

+ +

◆ Offset_t

-template<typename T>
- - - - - -
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long> - - - - + +
CUTLASS_HOST_DEVICE cutlass::TensorView< T >::TensorView (TensorRef_t const & _ref, typedef Base::LongIndex cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Offset_t
+
+ +
+ + +

◆ Storage

+ +
+
+
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ - - - - + +
Coord_t const & _size typedef Base::Storage cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Storage
+
+ +
+
+ +

◆ StorageCoord

+ +
+
+
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ - - - +
)typedef TensorRef::StorageCoord cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::StorageCoord
-
-inline
-

Member Function Documentation

- -

◆ at() [1/2]

+ +

◆ StrideVector

+ +
+
+
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef TensorRef::StrideVector cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::StrideVector
+
+

Stride vector in storage coordinate space Least significant stride is = 1 and not stored

+ +
+
+ +

◆ TensorCoord

+ +
+
+
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef TensorRef::TensorCoord cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorCoord
+
+ +
+
+ +

◆ TensorRef

+ +
+
+
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Base cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRef
+
+ +
+
+ +

◆ TensorRef_t

+ +
+
+
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef TensorRef cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRef_t
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ TensorView() [1/4]

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
@@ -415,23 +544,33 @@

-

◆ at() [2/2]

+ +

◆ TensorView() [2/4]

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - - - + +
CUTLASS_HOST_DEVICE T& cutlass::TensorView< T >::at CUTLASS_HOST_DEVICE cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorView (Coord_t const & coord) const)
@@ -443,22 +582,39 @@

-

◆ const_ref()

+ +

◆ TensorView() [3/4]

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - - - + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE T& cutlass::TensorView< T >::at CUTLASS_HOST_DEVICE cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorView (Offset_t idx) constBase const & _ref,
TensorCoord const & _size 
)
@@ -470,23 +626,39 @@

-

◆ contains()

+ +

◆ TensorView() [4/4]

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE ConstTensorRef_t cutlass::TensorView< T >::const_ref CUTLASS_HOST_DEVICE cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorView ()Storageptr,
StrideVector const & stride,
TensorCoord const & size 
)
@@ -498,19 +670,20 @@

-

◆ data()

+

Member Function Documentation

+ +

◆ capacity()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - - - + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE bool cutlass::TensorView< T >::contains CUTLASS_HOST_DEVICE cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorView (Coord_t const & coord) constStorageptr,
StorageCoord const & stride,
TensorCoord const & size 
)
@@ -394,8 +433,8 @@

-

◆ GemmGlobalIteratorCd() [2/2]

+ +

◆ GemmGlobalIteratorCd() [2/2]

@@ -406,10 +445,10 @@

- + @@ -525,19 +698,19 @@

-

◆ good()

+ +

◆ const_ref()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE T* cutlass::TensorView< T >::data CUTLASS_HOST_DEVICE size_t cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::capacity ( ) const
+ + + + + + @@ -185,7 +226,7 @@

diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params.png b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params.png index 0a3e71c3ee..9cce10a9a9 100644 Binary files a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params.png and b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params.png differ diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd-members.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd-members.html index e77b99eb67..f19fdc17cd 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd-members.html @@ -79,51 +79,55 @@

This is the complete list of members for cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >, including all inherited members.

- + @@ -552,21 +725,21 @@

-

◆ offset()

+ +

◆ contains()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE bool cutlass::TensorView< T >::good CUTLASS_HOST_DEVICE ConstTensorRef cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::const_ref ( ) const
@@ -604,8 +685,8 @@

-

◆ inc_advance()

+ +

◆ inc_d()

@@ -616,7 +697,7 @@

- + - + @@ -580,23 +753,23 @@

-

◆ operator=()

+ +

◆ operator+()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE Offset_t cutlass::TensorView< T >::offset CUTLASS_HOST_DEVICE bool cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::contains (Coord_t const & TensorCoord const &  coord) const
@@ -608,23 +781,23 @@

-

◆ operator[]()

+ +

◆ operator+=()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - - - + + +
CUTLASS_HOST_DEVICE TensorView& cutlass::TensorView< T >::operator= CUTLASS_HOST_DEVICE TensorView cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator+ (TensorView< T > const & _tensor)TensorCoord const & b) const
@@ -636,22 +809,23 @@

-

◆ ref() [1/2]

+ +

◆ operator-()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - - - + + +
T& cutlass::TensorView< T >::operator[] CUTLASS_HOST_DEVICE TensorView& cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator+= (Coord< Rank > const & coord) constTensorCoord const & b)
@@ -663,22 +837,23 @@

-

◆ ref() [2/2]

+ +

◆ operator-=()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - - + + +
CUTLASS_HOST_DEVICE TensorRef_t& cutlass::TensorView< T >::ref CUTLASS_HOST_DEVICE TensorView cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator- ()TensorCoord const & b) const
@@ -690,33 +865,23 @@

-

◆ reset()

+ +

◆ operator=()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - - + + +
CUTLASS_HOST_DEVICE TensorRef_t const& cutlass::TensorView< T >::ref CUTLASS_HOST_DEVICE TensorView& cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator-= () constTensorCoord const & b)
@@ -728,19 +893,19 @@

-

◆ size() [1/2]

+ +

◆ ref()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - - - - - - - - - - + + - -
CUTLASS_HOST_DEVICE void cutlass::TensorView< T >::reset CUTLASS_HOST_DEVICE TensorView& cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::operator= (TensorRef_t const & _ref = TensorRef_t(0),
Coord_t const & _size = Coord_t() 
TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > const & _tensor) )
- +
- + @@ -755,23 +920,33 @@

-

◆ size() [2/2]

+ +

◆ reset()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE Coord_t const& cutlass::TensorView< T >::size CUTLASS_HOST_DEVICE TensorRef cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::ref ( ) const
@@ -783,19 +958,19 @@

-

◆ stride() [1/2]

+ +

◆ size() [1/2]

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

- + - - - + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE int cutlass::TensorView< T >::size CUTLASS_HOST_DEVICE void cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::reset (int dim) constBase const & _ref = Base(),
TensorCoord const & _size = TensorCoord() 
)
- + @@ -810,19 +985,19 @@

-

◆ stride() [2/2]

+ +

◆ size() [2/2]

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE Coord_t const& cutlass::TensorView< T >::stride CUTLASS_HOST_DEVICE TensorCoord const& cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::size ( ) const
+ + + + + + + + + + + + + + + @@ -109,6 +124,9 @@ + + + @@ -124,9 +142,6 @@ - - - @@ -145,12 +160,21 @@ + + + + + + + + + @@ -170,7 +194,7 @@ diff --git a/docs/dir_c5917a9a879e9a6c73eaf5237444ab84.html b/docs/dir_c5917a9a879e9a6c73eaf5237444ab84.html index a66eb22fa5..9011cf40c0 100644 --- a/docs/dir_c5917a9a879e9a6c73eaf5237444ab84.html +++ b/docs/dir_c5917a9a879e9a6c73eaf5237444ab84.html @@ -79,12 +79,16 @@
- + @@ -838,27 +1013,27 @@

-

◆ subview()

+ +

◆ subview()

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>

CUTLASS_HOST_DEVICE int const& cutlass::TensorView< T >::stride CUTLASS_HOST_DEVICE Index cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::size ( int  dim)
- + - + - + @@ -877,19 +1052,19 @@

Member Data Documentation

- -

◆ Rank

+ +

◆ Rank

-template<typename T>
+template<typename Storage_ , int Rank_ = 4, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
CUTLASS_HOST_DEVICE TensorView<T> cutlass::TensorView< T >::subview CUTLASS_HOST_DEVICE TensorView cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::subview (Coord_t const & TensorCoord const &  location,
Coord_t TensorCoord  size 
@@ -907,7 +1082,7 @@

diff --git a/docs/classcutlass_1_1TensorView.png b/docs/classcutlass_1_1TensorView.png index 40500e8a3a..46861ac917 100644 Binary files a/docs/classcutlass_1_1TensorView.png and b/docs/classcutlass_1_1TensorView.png differ diff --git a/docs/classcutlass_1_1ZipTileIterator-members.html b/docs/classcutlass_1_1ZipTileIterator-members.html new file mode 100644 index 0000000000..6de74a4943 --- /dev/null +++ b/docs/classcutlass_1_1ZipTileIterator-members.html @@ -0,0 +1,125 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+

- +
int const cutlass::TensorView< T >::Rank = TensorRef_t::Rankint const cutlass::TensorView< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Rank = Base::kRank
+ + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+ + + + + + + + + +
+
+ + +
+ +
+ + + +
+
+
cutlass::ZipTileIterator< First_, Second_ > Member List
+
+
+ +

This is the complete list of members for cutlass::ZipTileIterator< First_, Second_ >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
add_pointer_offset(Index offset)cutlass::ZipTileIterator< First_, Second_ >inline
decrement(int count=1)cutlass::ZipTileIterator< First_, Second_ >inline
First typedefcutlass::ZipTileIterator< First_, Second_ >
firstcutlass::ZipTileIterator< First_, Second_ >
Fragment typedefcutlass::ZipTileIterator< First_, Second_ >
increment(int count=1)cutlass::ZipTileIterator< First_, Second_ >inline
Index typedefcutlass::ZipTileIterator< First_, Second_ >
initialize_predicates(PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))cutlass::ZipTileIterator< First_, Second_ >inline
initialize_predicates(PredicateIterator predicate_it, PredicateFunctor const &functor, Coord< 3 > const &block_offset)cutlass::ZipTileIterator< First_, Second_ >inline
load(Fragment &fragment) constcutlass::ZipTileIterator< First_, Second_ >inline
load(Fragment &fragment, Coord< 4 > const &offset) constcutlass::ZipTileIterator< First_, Second_ >inline
load(Fragment &fragment, PredicateIterator pred_it) constcutlass::ZipTileIterator< First_, Second_ >inline
load_post_increment(Fragment &fragment)cutlass::ZipTileIterator< First_, Second_ >inline
load_post_increment(Fragment &fragment, Coord< 4 > const &offset)cutlass::ZipTileIterator< First_, Second_ >inline
load_post_increment(Fragment &fragment, PredicateIterator pred_it)cutlass::ZipTileIterator< First_, Second_ >inline
operator++()cutlass::ZipTileIterator< First_, Second_ >inline
operator+=(int count)cutlass::ZipTileIterator< First_, Second_ >inline
operator+=(Coord< 3 > const &offset)cutlass::ZipTileIterator< First_, Second_ >inline
operator--()cutlass::ZipTileIterator< First_, Second_ >inline
operator-=(int count)cutlass::ZipTileIterator< First_, Second_ >inline
PredicateVector typedefcutlass::ZipTileIterator< First_, Second_ >
secondcutlass::ZipTileIterator< First_, Second_ >
Second typedefcutlass::ZipTileIterator< First_, Second_ >
store(Fragment const &fragment) constcutlass::ZipTileIterator< First_, Second_ >inline
store(Fragment const &fragment, Coord< 4 > const &offset) constcutlass::ZipTileIterator< First_, Second_ >inline
store(Fragment const &fragment, PredicateIterator pred_it) constcutlass::ZipTileIterator< First_, Second_ >inline
store_post_increment(Fragment const &fragment)cutlass::ZipTileIterator< First_, Second_ >inline
store_post_increment(Fragment const &fragment, Coord< 4 > const &offset)cutlass::ZipTileIterator< First_, Second_ >inline
store_post_increment(Fragment const &fragment, PredicateIterator pred_it)cutlass::ZipTileIterator< First_, Second_ >inline
TensorRef typedefcutlass::ZipTileIterator< First_, Second_ >
ZipTileIterator()cutlass::ZipTileIterator< First_, Second_ >inline
ZipTileIterator(Params const &_params, Coord< 3 > const &threadblock_offset=make_Coord(0, 0, 0))cutlass::ZipTileIterator< First_, Second_ >inline
ZipTileIterator(First const &_first, Second const &_second)cutlass::ZipTileIterator< First_, Second_ >inline
ZipTileIterator(TensorRef const &ref)cutlass::ZipTileIterator< First_, Second_ >inline
ZipTileIterator(Params const &_params, TensorRef const &ref)cutlass::ZipTileIterator< First_, Second_ >inline
+ + + + diff --git a/docs/classcutlass_1_1ZipTileIterator.html b/docs/classcutlass_1_1ZipTileIterator.html new file mode 100644 index 0000000000..7cf7a392b5 --- /dev/null +++ b/docs/classcutlass_1_1ZipTileIterator.html @@ -0,0 +1,1290 @@ + + + + + + + +Cutlass: cutlass::ZipTileIterator< First_, Second_ > Class Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::ZipTileIterator< First_, Second_ > Class Template Reference
+
+
+ +

Constructs an iterator from a pair of iterators. +

+ +

#include <zip_tile_iterator.h>

+ + + + + +

+Classes

struct  Params
 Params object. More...
 
+ + + + + + + + + + + + + + + + + + + +

+Public Types

typedef First_ First
 First iterator type. More...
 
typedef Second_ Second
 Second iterator type. More...
 
typedef ZipFragment< typename First::Fragment, typename Second::Fragment > Fragment
 Fragment type. More...
 
typedef First::PredicateVector PredicateVector
 Predicate vector. More...
 
typedef First::Index Index
 Index type. More...
 
typedef ZipTensorRef< typename First::TensorRef, typename Second::TensorRef > TensorRef
 Tensor reference. More...
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_DEVICE ZipTileIterator ()
 Default constructor. More...
 
CUTLASS_DEVICE ZipTileIterator (Params const &_params, Coord< 3 > const &threadblock_offset=make_Coord(0, 0, 0))
 Constructs a zip iterator from params. More...
 
CUTLASS_DEVICE ZipTileIterator (First const &_first, Second const &_second)
 Constructs a zip iterator from iterator instances. More...
 
CUTLASS_DEVICE ZipTileIterator (TensorRef const &ref)
 Constructs a zip iterator from a zipped tensor reference. More...
 
CUTLASS_DEVICE ZipTileIterator (Params const &_params, TensorRef const &ref)
 Constructs a zip iterator from params and a zipped tensor reference. More...
 
template<typename PredicateIterator >
CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))
 Initializes a predicate vector using a RegularTilePredicateFunctor. More...
 
template<typename PredicateIterator , typename PredicateFunctor >
CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, PredicateFunctor const &functor, Coord< 3 > const &block_offset)
 Initializes a predicate vector using an arbitrary predicate functor. More...
 
template<typename Fragment >
CUTLASS_DEVICE void load_post_increment (Fragment &fragment)
 Loads a fragment and increments without predicates. More...
 
template<typename Fragment >
CUTLASS_DEVICE void load_post_increment (Fragment &fragment, Coord< 4 > const &offset)
 Loads a fragment and increments without predicates. More...
 
template<typename Fragment >
CUTLASS_DEVICE void load (Fragment &fragment) const
 Loads a fragment without predicates. More...
 
template<typename Fragment >
CUTLASS_DEVICE void load (Fragment &fragment, Coord< 4 > const &offset) const
 Loads a fragment without predicates. More...
 
template<typename Fragment >
CUTLASS_DEVICE void store_post_increment (Fragment const &fragment)
 Stores a fragment and increments without predicates. More...
 
template<typename Fragment >
CUTLASS_DEVICE void store_post_increment (Fragment const &fragment, Coord< 4 > const &offset)
 Stores a fragment and increments without predicates. More...
 
template<typename Fragment >
CUTLASS_DEVICE void store (Fragment const &fragment) const
 Stores a fragment without predicates. More...
 
template<typename Fragment >
CUTLASS_DEVICE void store (Fragment const &fragment, Coord< 4 > const &offset) const
 Stores a fragment without predicates. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_DEVICE void load_post_increment (Fragment &fragment, PredicateIterator pred_it)
 Loads a fragment and increments, using predicates. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_DEVICE void load (Fragment &fragment, PredicateIterator pred_it) const
 Loads a fragment with predicates. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_DEVICE void store_post_increment (Fragment const &fragment, PredicateIterator pred_it)
 Stores a fragment and increments, using predicates. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_DEVICE void store (Fragment const &fragment, PredicateIterator pred_it) const
 Stores a fragment with predicates. More...
 
CUTLASS_DEVICE ZipTileIteratorincrement (int count=1)
 Increments store iterator to next tile. More...
 
CUTLASS_DEVICE ZipTileIteratoroperator++ ()
 Increments to next tile. More...
 
CUTLASS_DEVICE ZipTileIteratoroperator+= (int count)
 
CUTLASS_DEVICE ZipTileIteratoroperator+= (Coord< 3 > const &offset)
 Adds a vector offset to the underlying iterators. More...
 
CUTLASS_DEVICE ZipTileIteratordecrement (int count=1)
 Decrements store iterator to previous tile. More...
 
CUTLASS_DEVICE ZipTileIteratoroperator-- ()
 Decrements to previous tile. More...
 
CUTLASS_DEVICE ZipTileIteratoroperator-= (int count)
 Decrements to previous tile. More...
 
CUTLASS_DEVICE void add_pointer_offset (Index offset)
 Adds an offset to both iterators. More...
 
+ + + + + + + +

+Public Attributes

First first
 First iterator. More...
 
Second second
 Second iterator. More...
 
+

Member Typedef Documentation

+ +

◆ First

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef First_ cutlass::ZipTileIterator< First_, Second_ >::First
+
+ +
+
+ +

◆ Fragment

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef ZipFragment<typename First::Fragment, typename Second::Fragment> cutlass::ZipTileIterator< First_, Second_ >::Fragment
+
+ +
+
+ +

◆ Index

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef First::Index cutlass::ZipTileIterator< First_, Second_ >::Index
+
+ +
+
+ +

◆ PredicateVector

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef First::PredicateVector cutlass::ZipTileIterator< First_, Second_ >::PredicateVector
+
+ +
+
+ +

◆ Second

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef Second_ cutlass::ZipTileIterator< First_, Second_ >::Second
+
+ +
+
+ +

◆ TensorRef

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef ZipTensorRef< typename First::TensorRef, typename Second::TensorRef> cutlass::ZipTileIterator< First_, Second_ >::TensorRef
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ ZipTileIterator() [1/5]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE cutlass::ZipTileIterator< First_, Second_ >::ZipTileIterator ()
+
+inline
+
+ +
+
+ +

◆ ZipTileIterator() [2/5]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE cutlass::ZipTileIterator< First_, Second_ >::ZipTileIterator (Params const & _params,
Coord< 3 > const & threadblock_offset = make_Coord(0, 0, 0) 
)
+
+inline
+
+ +
+
+ +

◆ ZipTileIterator() [3/5]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE cutlass::ZipTileIterator< First_, Second_ >::ZipTileIterator (First const & _first,
Second const & _second 
)
+
+inline
+
+ +
+
+ +

◆ ZipTileIterator() [4/5]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE cutlass::ZipTileIterator< First_, Second_ >::ZipTileIterator (TensorRef const & ref)
+
+inline
+
+ +
+
+ +

◆ ZipTileIterator() [5/5]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE cutlass::ZipTileIterator< First_, Second_ >::ZipTileIterator (Params const & _params,
TensorRef const & ref 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ add_pointer_offset()

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::add_pointer_offset (Index offset)
+
+inline
+
+ +
+
+ +

◆ decrement()

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE ZipTileIterator& cutlass::ZipTileIterator< First_, Second_ >::decrement (int count = 1)
+
+inline
+
+ +
+
+ +

◆ increment()

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE ZipTileIterator& cutlass::ZipTileIterator< First_, Second_ >::increment (int count = 1)
+
+inline
+
+ +
+
+ +

◆ initialize_predicates() [1/2]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename PredicateIterator >
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::initialize_predicates (PredicateIterator predicate_it,
Coord< 3 > const & bounds,
Coord< 3 > const & block_offset = make_Coord(0,                                                                                           0,                                                                                           0) 
)
+
+inline
+
+ +
+
+ +

◆ initialize_predicates() [2/2]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename PredicateIterator , typename PredicateFunctor >
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::initialize_predicates (PredicateIterator predicate_it,
PredicateFunctor const & functor,
Coord< 3 > const & block_offset 
)
+
+inline
+
+ +
+
+ +

◆ load() [1/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::load (Fragmentfragment) const
+
+inline
+
+ +
+
+ +

◆ load() [2/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::load (Fragmentfragment,
Coord< 4 > const & offset 
) const
+
+inline
+
+ +
+
+ +

◆ load() [3/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment , typename PredicateIterator >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::load (Fragmentfragment,
PredicateIterator pred_it 
) const
+
+inline
+
+ +
+
+ +

◆ load_post_increment() [1/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::load_post_increment (Fragmentfragment)
+
+inline
+
+ +
+
+ +

◆ load_post_increment() [2/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::load_post_increment (Fragmentfragment,
Coord< 4 > const & offset 
)
+
+inline
+
+ +
+
+ +

◆ load_post_increment() [3/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment , typename PredicateIterator >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::load_post_increment (Fragmentfragment,
PredicateIterator pred_it 
)
+
+inline
+
+ +
+
+ +

◆ operator++()

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE ZipTileIterator& cutlass::ZipTileIterator< First_, Second_ >::operator++ ()
+
+inline
+
+ +
+
+ +

◆ operator+=() [1/2]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE ZipTileIterator& cutlass::ZipTileIterator< First_, Second_ >::operator+= (int count)
+
+inline
+
+ +
+
+ +

◆ operator+=() [2/2]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE ZipTileIterator& cutlass::ZipTileIterator< First_, Second_ >::operator+= (Coord< 3 > const & offset)
+
+inline
+
+ +
+
+ +

◆ operator--()

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE ZipTileIterator& cutlass::ZipTileIterator< First_, Second_ >::operator-- ()
+
+inline
+
+ +
+
+ +

◆ operator-=()

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE ZipTileIterator& cutlass::ZipTileIterator< First_, Second_ >::operator-= (int count)
+
+inline
+
+ +
+
+ +

◆ store() [1/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::store (Fragment const & fragment) const
+
+inline
+
+ +
+
+ +

◆ store() [2/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::store (Fragment const & fragment,
Coord< 4 > const & offset 
) const
+
+inline
+
+ +
+
+ +

◆ store() [3/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment , typename PredicateIterator >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::store (Fragment const & fragment,
PredicateIterator pred_it 
) const
+
+inline
+
+ +
+
+ +

◆ store_post_increment() [1/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment >
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::store_post_increment (Fragment const & fragment)
+
+inline
+
+ +
+
+ +

◆ store_post_increment() [2/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::store_post_increment (Fragment const & fragment,
Coord< 4 > const & offset 
)
+
+inline
+
+ +
+
+ +

◆ store_post_increment() [3/3]

+ +
+
+
+template<typename First_ , typename Second_ >
+
+template<typename Fragment , typename PredicateIterator >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipTileIterator< First_, Second_ >::store_post_increment (Fragment const & fragment,
PredicateIterator pred_it 
)
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ first

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
First cutlass::ZipTileIterator< First_, Second_ >::first
+
+ +
+
+ +

◆ second

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
Second cutlass::ZipTileIterator< First_, Second_ >::second
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/docs/classcutlass_1_1detail_1_1ScalarOrPointer-members.html b/docs/classcutlass_1_1detail_1_1ScalarOrPointer-members.html new file mode 100644 index 0000000000..8da7146658 --- /dev/null +++ b/docs/classcutlass_1_1detail_1_1ScalarOrPointer-members.html @@ -0,0 +1,101 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::detail::ScalarOrPointer< Scalar_ > Member List
+
+ + + + + diff --git a/docs/classcutlass_1_1detail_1_1ScalarOrPointer.html b/docs/classcutlass_1_1detail_1_1ScalarOrPointer.html new file mode 100644 index 0000000000..6a28c38f84 --- /dev/null +++ b/docs/classcutlass_1_1detail_1_1ScalarOrPointer.html @@ -0,0 +1,434 @@ + + + + + + + +Cutlass: cutlass::detail::ScalarOrPointer< Scalar_ > Class Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::detail::ScalarOrPointer< Scalar_ > Class Template Reference
+
+
+ +

#include <scalar_or_pointer.h>

+ + + + + +

+Public Types

typedef Scalar_ Scalar
 Underlying scalar type. More...
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE ScalarOrPointer ()
 Default ctor. More...
 
CUTLASS_HOST_DEVICE ScalarOrPointer (Scalar const &val)
 Object behaves as a scalar. More...
 
CUTLASS_HOST_DEVICE ScalarOrPointer (Scalar const *ptr_)
 Object behaves as a pointer. More...
 
CUTLASS_HOST_DEVICE bool is_pointer () const
 Returns true if is pointer. More...
 
CUTLASS_HOST_DEVICE Scalar const * get_ptr () const
 Gets the pointer value. More...
 
CUTLASS_HOST_DEVICE Scalar get_scalar () const
 Gets the scalar value. More...
 
CUTLASS_HOST_DEVICE ScalarOrPointeroperator= (Scalar const &scalar_)
 Assigns to a scalar and sets pointer to nullptr. More...
 
CUTLASS_HOST_DEVICE ScalarOrPointeroperator= (Scalar const *ptr_)
 Assigns to a pointer value. More...
 
CUTLASS_HOST_DEVICE Scalar get () const
 Access the element. More...
 
CUTLASS_HOST_DEVICE operator Scalar () const
 Accesses the element. More...
 
+

Detailed Description

+

template<typename Scalar_>
+class cutlass::detail::ScalarOrPointer< Scalar_ >

+ +

Helper class defines an object which operates as either a scalar or a pointer. If the pointer is non-null, it is dereferenced when the object is accessed.

+

Member Typedef Documentation

+ +

◆ Scalar

+ +
+
+
+template<typename Scalar_>
+ + + + +
typedef Scalar_ cutlass::detail::ScalarOrPointer< Scalar_ >::Scalar
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ ScalarOrPointer() [1/3]

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::detail::ScalarOrPointer< Scalar_ >::ScalarOrPointer ()
+
+inline
+
+ +
+
+ +

◆ ScalarOrPointer() [2/3]

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::detail::ScalarOrPointer< Scalar_ >::ScalarOrPointer (Scalar const & val)
+
+inline
+
+ +
+
+ +

◆ ScalarOrPointer() [3/3]

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::detail::ScalarOrPointer< Scalar_ >::ScalarOrPointer (Scalar const * ptr_)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ get()

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Scalar cutlass::detail::ScalarOrPointer< Scalar_ >::get () const
+
+inline
+
+ +
+
+ +

◆ get_ptr()

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Scalar const* cutlass::detail::ScalarOrPointer< Scalar_ >::get_ptr () const
+
+inline
+
+ +
+
+ +

◆ get_scalar()

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Scalar cutlass::detail::ScalarOrPointer< Scalar_ >::get_scalar () const
+
+inline
+
+ +
+
+ +

◆ is_pointer()

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE bool cutlass::detail::ScalarOrPointer< Scalar_ >::is_pointer () const
+
+inline
+
+ +
+
+ +

◆ operator Scalar()

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::detail::ScalarOrPointer< Scalar_ >::operator Scalar () const
+
+inline
+
+ +
+
+ +

◆ operator=() [1/2]

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ScalarOrPointer& cutlass::detail::ScalarOrPointer< Scalar_ >::operator= (Scalar const & scalar_)
+
+inline
+
+ +
+
+ +

◆ operator=() [2/2]

+ +
+
+
+template<typename Scalar_>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE ScalarOrPointer& cutlass::detail::ScalarOrPointer< Scalar_ >::operator= (Scalar const * ptr_)
+
+inline
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/docs/classcutlass_1_1gemm_1_1LinearScalingDevicePtr_1_1Params-members.html b/docs/classcutlass_1_1gemm_1_1LinearScalingDevicePtr_1_1Params-members.html new file mode 100644 index 0000000000..323b1406c9 --- /dev/null +++ b/docs/classcutlass_1_1gemm_1_1LinearScalingDevicePtr_1_1Params-members.html @@ -0,0 +1,98 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params Member List
+
+ + + + + diff --git a/docs/classcutlass_1_1gemm_1_1LinearScalingDevicePtr_1_1Params.html b/docs/classcutlass_1_1gemm_1_1LinearScalingDevicePtr_1_1Params.html new file mode 100644 index 0000000000..5fc5d05e3d --- /dev/null +++ b/docs/classcutlass_1_1gemm_1_1LinearScalingDevicePtr_1_1Params.html @@ -0,0 +1,389 @@ + + + + + + + +Cutlass: cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params Class Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params Class Reference
+
+
+ +

The parameters. +

+ +

#include <linear_scaling_device_ptr.h>

+ + + + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE Params ()
 
CUTLASS_HOST_DEVICE Params (Scalar alpha, Scalar beta)
 
CUTLASS_HOST_DEVICE Params (Scalar const *alpha_ptr, Scalar const *beta_ptr)
 
CUTLASS_HOST_DEVICE int initialize (Scalar alpha, Scalar beta)
 Initialize the parameters. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar const *alpha, Scalar const *beta)
 Initialize the parameters. More...
 
template<typename GemmDesc_ >
CUTLASS_HOST_DEVICE int initialize (GemmDesc_ const &desc)
 Initialize the parameters. More...
 
CUTLASS_HOST_DEVICE Scalar alpha () const
 Gets the alpha scalar. More...
 
CUTLASS_HOST_DEVICE Scalar beta () const
 Gets the beta scalar. More...
 
+

Constructor & Destructor Documentation

+ +

◆ Params() [1/3]

+ +
+
+
+template<typename Scalar_ , typename FragmentMultiplyAdd_ = FragmentMultiplyAdd<Scalar_, Scalar_>>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params::Params ()
+
+inline
+
+ +
+
+ +

◆ Params() [2/3]

+ +
+
+
+template<typename Scalar_ , typename FragmentMultiplyAdd_ = FragmentMultiplyAdd<Scalar_, Scalar_>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params::Params (Scalar alpha,
Scalar beta 
)
+
+inline
+
+ +
+
+ +

◆ Params() [3/3]

+ +
+
+
+template<typename Scalar_ , typename FragmentMultiplyAdd_ = FragmentMultiplyAdd<Scalar_, Scalar_>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params::Params (Scalar const * alpha_ptr,
Scalar const * beta_ptr 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ alpha()

+ +
+
+
+template<typename Scalar_ , typename FragmentMultiplyAdd_ = FragmentMultiplyAdd<Scalar_, Scalar_>>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Scalar cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params::alpha () const
+
+inline
+
+ +
+
+ +

◆ beta()

+ +
+
+
+template<typename Scalar_ , typename FragmentMultiplyAdd_ = FragmentMultiplyAdd<Scalar_, Scalar_>>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Scalar cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params::beta () const
+
+inline
+
+ +
+
+ +

◆ initialize() [1/3]

+ +
+
+
+template<typename Scalar_ , typename FragmentMultiplyAdd_ = FragmentMultiplyAdd<Scalar_, Scalar_>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE int cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params::initialize (Scalar alpha,
Scalar beta 
)
+
+inline
+
+ +
+
+ +

◆ initialize() [2/3]

+ +
+
+
+template<typename Scalar_ , typename FragmentMultiplyAdd_ = FragmentMultiplyAdd<Scalar_, Scalar_>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE int cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params::initialize (Scalar const * alpha,
Scalar const * beta 
)
+
+inline
+
+ +
+
+ +

◆ initialize() [3/3]

+ +
+
+
+template<typename Scalar_ , typename FragmentMultiplyAdd_ = FragmentMultiplyAdd<Scalar_, Scalar_>>
+
+template<typename GemmDesc_ >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE int cutlass::gemm::LinearScalingDevicePtr< Scalar_, FragmentMultiplyAdd_ >::Params::initialize (GemmDesc_ const & desc)
+
+inline
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/docs/classcutlass_1_1platform_1_1complex-members.html b/docs/classcutlass_1_1platform_1_1complex-members.html new file mode 100644 index 0000000000..3e19742e03 --- /dev/null +++ b/docs/classcutlass_1_1platform_1_1complex-members.html @@ -0,0 +1,100 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::platform::complex< T > Member List
+
+ + + + + diff --git a/docs/classcutlass_1_1platform_1_1complex.html b/docs/classcutlass_1_1platform_1_1complex.html new file mode 100644 index 0000000000..672fef7e9a --- /dev/null +++ b/docs/classcutlass_1_1platform_1_1complex.html @@ -0,0 +1,413 @@ + + + + + + + +Cutlass: cutlass::platform::complex< T > Class Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::platform::complex< T > Class Template Reference
+
+
+ +

#include <complex.h>

+ + + + + +

+Public Types

typedef T value_type
 Type alias for scalar type. More...
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE complex (T r=T(0), T i=T(0))
 Constructor. More...
 
CUTLASS_HOST_DEVICE complex (cuFloatComplex const &z)
 Conversion from cuFloatComplex. More...
 
CUTLASS_HOST_DEVICE complex (cuDoubleComplex const &z)
 Conversion from cuDoubleComplex. More...
 
CUTLASS_HOST_DEVICE T const & real () const
 Accesses the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE T & real ()
 Accesses the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE T const & imag () const
 Accesses the imaginary part of the complex number. More...
 
CUTLASS_HOST_DEVICE T & imag ()
 Accesses the imaginary part of the complex number. More...
 
CUTLASS_HOST_DEVICE operator cuFloatComplex () const
 Converts to cuFloatComplex. More...
 
CUTLASS_HOST_DEVICE operator cuDoubleComplex () const
 Converts to cuDoubleComplex. More...
 
+

Detailed Description

+

template<typename T>
+class cutlass::platform::complex< T >

+ +

Class for representing and manipulating complex numbers with conversions from built-in CUDA complex types.

+

Member Typedef Documentation

+ +

◆ value_type

+ +
+
+
+template<typename T>
+ + + + +
typedef T cutlass::platform::complex< T >::value_type
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ complex() [1/3]

+ +
+
+
+template<typename T>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::platform::complex< T >::complex (r = T(0),
i = T(0) 
)
+
+inline
+
+ +
+
+ +

◆ complex() [2/3]

+ +
+
+
+template<typename T>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::platform::complex< T >::complex (cuFloatComplex const & z)
+
+inline
+
+ +
+
+ +

◆ complex() [3/3]

+ +
+
+
+template<typename T>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::platform::complex< T >::complex (cuDoubleComplex const & z)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ imag() [1/2]

+ +
+
+
+template<typename T>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE T const& cutlass::platform::complex< T >::imag () const
+
+inline
+
+ +
+
+ +

◆ imag() [2/2]

+ +
+
+
+template<typename T>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE T& cutlass::platform::complex< T >::imag ()
+
+inline
+
+ +
+
+ +

◆ operator cuDoubleComplex()

+ +
+
+
+template<typename T>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::platform::complex< T >::operator cuDoubleComplex () const
+
+inline
+
+ +
+
+ +

◆ operator cuFloatComplex()

+ +
+
+
+template<typename T>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::platform::complex< T >::operator cuFloatComplex () const
+
+inline
+
+ +
+
+ +

◆ real() [1/2]

+ +
+
+
+template<typename T>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE T const& cutlass::platform::complex< T >::real () const
+
+inline
+
+ +
+
+ +

◆ real() [2/2]

+ +
+
+
+template<typename T>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE T& cutlass::platform::complex< T >::real ()
+
+inline
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/docs/classcutlass_1_1platform_1_1unique__ptr-members.html b/docs/classcutlass_1_1platform_1_1unique__ptr-members.html index 696f478842..1242de6835 100644 --- a/docs/classcutlass_1_1platform_1_1unique__ptr-members.html +++ b/docs/classcutlass_1_1platform_1_1unique__ptr-members.html @@ -98,7 +98,7 @@
diff --git a/docs/classcutlass_1_1platform_1_1unique__ptr.html b/docs/classcutlass_1_1platform_1_1unique__ptr.html index cf455f2e5f..625e790b8d 100644 --- a/docs/classcutlass_1_1platform_1_1unique__ptr.html +++ b/docs/classcutlass_1_1platform_1_1unique__ptr.html @@ -546,7 +546,7 @@

diff --git a/docs/classes.html b/docs/classes.html index 9896653f61..6a517312cc 100644 --- a/docs/classes.html +++ b/docs/classes.html @@ -72,100 +72,116 @@
Class Index
-
a | b | c | d | e | f | g | h | i | l | m | n | p | r | s | t | u | v | w
+
a | b | c | d | e | f | g | h | i | k | l | m | n | p | r | s | t | u | v | w | z
- - - - - - - - - - - - - - - - + + + + + + + + + - - - + + + + + + + + + + - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - - + + + - + + + + + + + - - - - - - + + + + + + + + + + + - - - - - - - + + + + + + + + +
  a  
-
FragmentMultiplyAdd (cutlass::gemm)   IgemmEpilogueScalar (cutlass::gemm)   Load< Scalar_, Lanes_, Memory_, true, 8 > (cutlass)   GlobalLoadStreamBase::SharedStorage (cutlass::gemm)   
FragmentMultiplyAdd< half > (cutlass::gemm)   IgemmEpilogueScalar< int > (cutlass::gemm)   log2_down (cutlass)   SimplifiedGemmEpilogueTraits (cutlass::gemm)   
aligned_chunk (cutlass::platform)   FragmentStore (cutlass)   IgemmEpilogueTraits (cutlass::gemm)   log2_down< N, 1, Count > (cutlass)   SimplifiedGemmTraits (cutlass::gemm)   
aligned_storage (cutlass::platform)   FragmentStore< IteratorFragment::kScalar, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride > (cutlass)   IgemmEpilogueTraitsHelper (cutlass::gemm)   log2_up (cutlass)   SimplifiedGemmTraitsHelper (cutlass::gemm)   
AlignedStruct (cutlass)   FragmentStore< IteratorFragment::kWmmaMatrix, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride > (cutlass)   IgemmFloatToInt8Converter (cutlass::gemm)   log2_up< N, 1, Count > (cutlass)   sqrt_est (cutlass)   
alignment_of (cutlass::platform)   
  g  
-
IgemmGlobalLoadTransformer (cutlass::gemm)   
  m  
-
StorageType (cutlass)   
alignment_of< const value_t > (cutlass::platform)   IgemmGlobalLoadTransformer< Fragment< int8_t, kElements_ >, float > (cutlass::gemm)   StorageType< 1 > (cutlass)   
alignment_of< const volatile value_t > (cutlass::platform)   Gemm (cutlass::gemm)   IgemmGlobalStoreTransformer (cutlass::gemm)   GemmTraits::MainLoopSharedStorage (cutlass::gemm)   StorageType< 2 > (cutlass)   
alignment_of< double2 > (cutlass::platform)   GemmConfig (cutlass::gemm)   IgemmGlobalStoreTransformer< float, Fragment< int8_t, kElements_ > > (cutlass::gemm)   MatrixLayout (cutlass)   StorageType< 4 > (cutlass)   
alignment_of< double4 > (cutlass::platform)   GemmDesc (cutlass::gemm)   IgemmInt8ToFloatConverter (cutlass::gemm)   MemorySpace (cutlass)   Store (cutlass)   
alignment_of< float4 > (cutlass::platform)   GemmEpilogue (cutlass::gemm)   IgemmSharedStoreTransformer (cutlass::gemm)   
  n  
-
Store< double, 2, Memory_, true, 16 > (cutlass)   
alignment_of< int4 > (cutlass::platform)   GemmEpilogueTraits (cutlass::gemm)   IgemmSwizzle (cutlass::gemm)   Store< Scalar_, Lanes_, Memory_, true, 16 > (cutlass)   
alignment_of< long4 > (cutlass::platform)   GemmEpilogueTraitsHelper (cutlass::gemm)   IgemmTileTraitsHelperA (cutlass::gemm)   nullptr_t (cutlass::platform)   Store< Scalar_, Lanes_, Memory_, true, 4 > (cutlass)   
alignment_of< longlong2 > (cutlass::platform)   GemmGlobalIteratorAb (cutlass::gemm)   IgemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ > (cutlass::gemm)   
  p  
-
Store< Scalar_, Lanes_, Memory_, true, 8 > (cutlass)   
alignment_of< longlong4 > (cutlass::platform)   GemmGlobalIteratorCd (cutlass::gemm)   IgemmTileTraitsHelperB (cutlass::gemm)   GemmTraits::StreamSharedStorage (cutlass::gemm)   
alignment_of< uint4 > (cutlass::platform)   GemmGlobalTileCdTraits (cutlass::gemm)   IgemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ > (cutlass::gemm)   alignment_of::pad (cutlass::platform)   GemmEpilogueTraits::StreamSharedStorage (cutlass::gemm)   
alignment_of< ulong4 > (cutlass::platform)   GemmGlobalTileTraits (cutlass::gemm)   IgemmTraits (cutlass::gemm)   WmmaGemmGlobalIteratorCd::Params (cutlass::gemm)   
  t  
+
GemmConfig (cutlass::gemm)   IgemmTraitsHelper (cutlass::gemm)   LinearScalingDevicePtr::Params (cutlass::gemm)   Store< double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16 > (cutlass)   
GemmCoord (cutlass::gemm)   IgemmTransformerA (cutlass::gemm)   GlobalLoadStream::Params (cutlass::gemm)   Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2 > (cutlass)   
aligned_chunk (cutlass::platform)   GemmDesc (cutlass::gemm)   IgemmTransformerA< MatrixLayout::kColumnMajor, Iterator_ > (cutlass::gemm)   SharedStreamPair::Params (cutlass::gemm)   Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16 > (cutlass)   
aligned_storage (cutlass::platform)   GemmEpilogue (cutlass::gemm)   IgemmTransformerA< MatrixLayout::kRowMajor, Iterator_ > (cutlass::gemm)   WmmaGemmGlobalIteratorCd::Params (cutlass::gemm)   Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4 > (cutlass)   
AlignedStruct (cutlass)   GemmEpilogueTraits (cutlass::gemm)   IgemmTransformerB (cutlass::gemm)   ZipTileIterator::Params (cutlass)   Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8 > (cutlass)   
alignment_of (cutlass::platform)   GemmEpilogueTraitsHelper (cutlass::gemm)   IgemmTransformerB< MatrixLayout::kColumnMajor, Iterator_ > (cutlass::gemm)   GemmTraits::Params (cutlass::gemm)   Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size > (cutlass)   
alignment_of< const value_t > (cutlass::platform)   GemmGlobalIteratorAb (cutlass::gemm)   IgemmTransformerB< MatrixLayout::kRowMajor, Iterator_ > (cutlass::gemm)   LinearScaling::Params (cutlass::gemm)   GemmEpilogueTraits::StreamSharedStorage (cutlass::gemm)   
alignment_of< const volatile value_t > (cutlass::platform)   GemmGlobalIteratorCd (cutlass::gemm)   int4_t (cutlass)   GemmGlobalIteratorAb::Params (cutlass::gemm)   TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::StrideVector (cutlass)   
alignment_of< double2 > (cutlass::platform)   GemmGlobalTileCdTraits (cutlass::gemm)   integral_constant (cutlass::platform)   GlobalLoadStreamPair::Params (cutlass::gemm)   swizzleDirection (cutlass::gemm)   
alignment_of< double4 > (cutlass::platform)   GemmGlobalTileTraits (cutlass::gemm)   is_arithmetic (cutlass::platform)   GemmGlobalIteratorCd::Params (cutlass::gemm)   
  t  
alignment_of< ulonglong2 > (cutlass::platform)   GemmMultiplicandTraits (cutlass::gemm)   IgemmTraitsHelper (cutlass::gemm)   GemmTraits::Params (cutlass::gemm)   
alignment_of< ulonglong4 > (cutlass::platform)   GemmOperand (cutlass)   IgemmTransformerA (cutlass::gemm)   GlobalLoadStreamBase::Params (cutlass::gemm)   TensorRef (cutlass)   
alignment_of< volatile value_t > (cutlass::platform)   GemmOperandTraitsAb (cutlass::gemm)   IgemmTransformerA< MatrixLayout::kColumnMajor, Iterator_ > (cutlass::gemm)   TileIteratorBase::Params (cutlass)   TensorView (cutlass)   
alignment_of< float4 > (cutlass::platform)   GemmMultiplicandTraits (cutlass::gemm)   is_base_of (cutlass::platform)   GemmEpilogueTraits::Params (cutlass::gemm)   
alignment_of< int4 > (cutlass::platform)   GemmOperand (cutlass)   is_base_of_helper (cutlass::platform)   TileIteratorBase::Params (cutlass)   TensorRef (cutlass)   
alignment_of< long4 > (cutlass::platform)   GemmOperandTraitsAb (cutlass::gemm)   is_floating_point (cutlass::platform)   TileLoadIterator::Params (cutlass)   TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ > (cutlass)   
alignment_of< longlong2 > (cutlass::platform)   GemmSharedLoadTileATraits (cutlass::gemm)   is_fundamental (cutlass::platform)   TileStoreIterator::Params (cutlass)   TensorRefArray (cutlass)   
alignment_of< longlong4 > (cutlass::platform)   GemmSharedLoadTileBTraits (cutlass::gemm)   is_integral (cutlass::platform)   TileLoadStream::Params (cutlass)   TensorRefBatchStrided (cutlass)   
alignment_of< uint4 > (cutlass::platform)   GemmSharedLoadTileDTraits (cutlass::gemm)   is_integral< char > (cutlass::platform)   TileStoreStream::Params (cutlass)   TensorView (cutlass)   
alignment_of< ulong4 > (cutlass::platform)   GemmSharedStoreTileAbTraits (cutlass::gemm)   is_integral< const T > (cutlass::platform)   SharedLoadStream::Params (cutlass::gemm)   ThreadMultiplyAdd (cutlass::gemm)   
alignment_of< ulonglong2 > (cutlass::platform)   GemmSharedStoreTileDTraits (cutlass::gemm)   is_integral< const volatile T > (cutlass::platform)   plus (cutlass::platform)   ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float > (cutlass::gemm)   
alignment_of< ulonglong4 > (cutlass::platform)   GemmSharedStoreWithSkewTileAbTraits (cutlass::gemm)   is_integral< int > (cutlass::platform)   PredicatedTileLoadStream (cutlass)   ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, half > (cutlass::gemm)   
alignment_of< volatile value_t > (cutlass::platform)   GemmTileTraitsHelperA (cutlass::gemm)   is_integral< long > (cutlass::platform)   PredicatedTileStoreStream (cutlass)   ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, int8_t, int8_t, int > (cutlass::gemm)   
  b  
-
GemmSharedLoadTileATraits (cutlass::gemm)   IgemmTransformerA< MatrixLayout::kRowMajor, Iterator_ > (cutlass::gemm)   GemmGlobalIteratorCd::Params (cutlass::gemm)   ThreadMultiplyAdd (cutlass::gemm)   
GemmSharedLoadTileBTraits (cutlass::gemm)   IgemmTransformerB (cutlass::gemm)   TileLoadIterator::Params (cutlass)   ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, half, half, half > (cutlass::gemm)   
bool_constant (cutlass::platform)   GemmSharedLoadTileDTraits (cutlass::gemm)   IgemmTransformerB< MatrixLayout::kColumnMajor, Iterator_ > (cutlass::gemm)   TileStoreIterator::Params (cutlass)   ThreadMultiplyAdd< AccumulatorsPerThread_, ThreadsPerWarp_, int8_t, int8_t, int > (cutlass::gemm)   
GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ > (cutlass::gemm)   is_integral< long long > (cutlass::platform)   PredicateTileAdapter (cutlass)   GemmSharedStoreTileAbTraits::ThreadOffset (cutlass::gemm)   
GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > (cutlass::gemm)   is_integral< short > (cutlass::platform)   TileLoadStream::PredicateVector (cutlass)   WmmaGemmGlobalIteratorCdTraits::ThreadOffset (cutlass::gemm)   
bin1_t (cutlass)   GemmTileTraitsHelperB (cutlass::gemm)   is_integral< signed char > (cutlass::platform)   PredicateVector (cutlass)   GemmGlobalTileCdTraits::ThreadOffset (cutlass::gemm)   
bool_constant (cutlass::platform)   GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > (cutlass::gemm)   is_integral< unsigned char > (cutlass::platform)   TileStoreStream::PredicateVector (cutlass)   GemmSharedLoadTileATraits::ThreadOffset (cutlass::gemm)   
  c  
-
GemmSharedStoreTileAbTraits (cutlass::gemm)   IgemmTransformerB< MatrixLayout::kRowMajor, Iterator_ > (cutlass::gemm)   GemmEpilogueTraits::Params (cutlass::gemm)   GemmSharedLoadTileBTraits::ThreadOffset (cutlass::gemm)   
GemmSharedStoreTileDTraits (cutlass::gemm)   integral_constant (cutlass::platform)   Gemm::Params (cutlass::gemm)   GemmGlobalTileCdTraits::ThreadOffset (cutlass::gemm)   
ClearAccumulators (cutlass::gemm)   GemmSharedStoreWithSkewTileAbTraits (cutlass::gemm)   is_arithmetic (cutlass::platform)   SharedLoadStream::Params (cutlass::gemm)   IgemmContiguousGlobalTileTraits::ThreadOffset (cutlass::gemm)   
ComputeOffsetFromShape (cutlass)   GemmTileTraitsHelperA (cutlass::gemm)   is_base_of (cutlass::platform)   LinearScaling::Params (cutlass::gemm)   GemmGlobalTileTraits::ThreadOffset (cutlass::gemm)   
ComputeOffsetFromShape< Shape< 1, kSh_, kSw_, 1 > > (cutlass)   GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ > (cutlass::gemm)   is_base_of_helper (cutlass::platform)   GemmGlobalIteratorAb::Params (cutlass::gemm)   GemmSharedLoadTileDTraits::ThreadOffset (cutlass::gemm)   
ComputeOffsetFromShape< Shape< 1, kSh_, kSw_, kSc_ > > (cutlass)   GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > (cutlass::gemm)   is_floating_point (cutlass::platform)   plus (cutlass::platform)   GemmSharedLoadTileATraits::ThreadOffset (cutlass::gemm)   
ComputeOffsetFromStrides (cutlass)   GemmTileTraitsHelperB (cutlass::gemm)   is_fundamental (cutlass::platform)   PredicateTileAdapter (cutlass)   GemmSharedStoreTileDTraits::ThreadOffset (cutlass::gemm)   
ComputeOffsetFromStrides< Shape< 1, S_h_, S_w_, 1 > > (cutlass)   GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > (cutlass::gemm)   is_integral (cutlass::platform)   PredicateVector (cutlass)   HgemmCrosswiseGlobalTileTraits::ThreadOffset (cutlass::gemm)   
ComputeOffsetFromStrides< Shape< 1, S_h_, S_w_, S_c_ > > (cutlass)   GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ > (cutlass::gemm)   is_integral< char > (cutlass::platform)   ProjectOperand (cutlass::gemm)   GemmSharedStoreTileAbTraits::ThreadOffset (cutlass::gemm)   
ComputeThreadOffsetFromStrides (cutlass)   GemmTraits (cutlass::gemm)   is_integral< const T > (cutlass::platform)   ProjectOperand< GemmOperand::kA, Kstrided > (cutlass::gemm)   TileTraitsWarpRake::ThreadOffset (cutlass)   
ComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, 1 >, Shape< 1, S_h_, S_w_, 1 > > (cutlass)   GetExtent (cutlass::gemm)   is_integral< const volatile T > (cutlass::platform)   ProjectOperand< GemmOperand::kB, Kstrided > (cutlass::gemm)   GemmSharedStoreWithSkewTileAbTraits::ThreadOffset (cutlass::gemm)   
ComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, T_c_ >, Shape< 1, S_h_, S_w_, S_c_ > > (cutlass)   GetExtent< GemmOperand::kA, Tile_ > (cutlass::gemm)   is_integral< int > (cutlass::platform)   ProjectOperand< GemmOperand::kC, true > (cutlass::gemm)   WmmaGemmGlobalIteratorCdTraits::ThreadOffset (cutlass::gemm)   
conditional (cutlass::platform)   GetExtent< GemmOperand::kB, Tile_ > (cutlass::gemm)   is_integral< long > (cutlass::platform)   ProjectOperand< GemmOperand::kD, true > (cutlass::gemm)   TiledThreadOffset (cutlass)   
conditional< false, T, F > (cutlass::platform)   GemmTraits::GlobalLoadStream (cutlass::gemm)   is_integral< long long > (cutlass::platform)   
  r  
-
TileIteratorBase (cutlass)   
PredicateVector::ConstIterator (cutlass)   GlobalLoadStream (cutlass::gemm)   is_integral< short > (cutlass::platform)   TileLoadIterator (cutlass)   
ConstPredicateTileAdapter (cutlass)   GlobalLoadStreamBase (cutlass::gemm)   is_integral< signed char > (cutlass::platform)   remove_const (cutlass::platform)   TileStoreIterator (cutlass)   
Convert (cutlass)   greater (cutlass::platform)   is_integral< unsigned char > (cutlass::platform)   remove_const< const T > (cutlass::platform)   TileTraits (cutlass)   
Convert< Fragment< InputScalar_, kScalars_ >, Fragment< OutputScalar_, kScalars_ > > (cutlass)   
  h  
-
is_integral< unsigned int > (cutlass::platform)   remove_cv (cutlass::platform)   TileTraitsContiguousMajor (cutlass)   
Coord (cutlass)   is_integral< unsigned long > (cutlass::platform)   remove_volatile (cutlass::platform)   TileTraitsStandard (cutlass)   
Copy (cutlass)   HgemmConfig (cutlass::gemm)   is_integral< unsigned long long > (cutlass::platform)   remove_volatile< volatile T > (cutlass::platform)   TileTraitsStrideMajor (cutlass)   
  d  
-
HgemmCrosswiseGlobalTileTraits (cutlass::gemm)   is_integral< unsigned short > (cutlass::platform)   ReshapeThreads (cutlass::gemm)   TileTraitsWarpRake (cutlass)   
HgemmSwizzle (cutlass::gemm)   is_integral< volatile T > (cutlass::platform)   ReshapeThreads< Tile_, Threads_, true > (cutlass::gemm)   PredicateVector::TrivialIterator (cutlass)   
default_delete (cutlass::platform)   HgemmTileTraitsHelperA (cutlass::gemm)   is_pointer (cutlass::platform)   ReshapeTile (cutlass)   TrivialPredicateTileAdapter (cutlass)   
default_delete< T[]> (cutlass::platform)   HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > (cutlass::gemm)   is_pointer_helper (cutlass::platform)   ReshapeTile< Tile_, kAccessSize_, true > (cutlass)   
  u  
-
DgemmConfig (cutlass::gemm)   HgemmTileTraitsHelperB (cutlass::gemm)   is_pointer_helper< T * > (cutlass::platform)   
  s  
+
GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ > (cutlass::gemm)   is_integral< unsigned int > (cutlass::platform)   ProjectOperand (cutlass::gemm)   GemmSharedStoreWithSkewTileAbTraits::ThreadOffset (cutlass::gemm)   
GemmTraits (cutlass::gemm)   is_integral< unsigned long > (cutlass::platform)   ProjectOperand< GemmOperand::kA, Kstrided > (cutlass::gemm)   IgemmGlobalTileTraits::ThreadOffset (cutlass::gemm)   
ClearAccumulators (cutlass::gemm)   GetExtent (cutlass::gemm)   is_integral< unsigned long long > (cutlass::platform)   ProjectOperand< GemmOperand::kB, Kstrided > (cutlass::gemm)   GemmSharedLoadTileBTraits::ThreadOffset (cutlass::gemm)   
MatrixLayout::ColumnMajor (cutlass)   GetExtent< GemmOperand::kA, Tile_ > (cutlass::gemm)   is_integral< unsigned short > (cutlass::platform)   ProjectOperand< GemmOperand::kC, true > (cutlass::gemm)   GemmGlobalTileTraits::ThreadOffset (cutlass::gemm)   
MatrixLayout::ColumnMajorBlockLinear (cutlass)   GetExtent< GemmOperand::kB, Tile_ > (cutlass::gemm)   is_integral< volatile T > (cutlass::platform)   ProjectOperand< GemmOperand::kD, true > (cutlass::gemm)   GemmSharedLoadTileDTraits::ThreadOffset (cutlass::gemm)   
ColumnMajorBlockSwizzle (cutlass::gemm)   GlobalLoadStream (cutlass::gemm)   is_pointer (cutlass::platform)   
  r  
+
TileTraitsWarpRake::ThreadOffset (cutlass)   
MatrixLayout::ColumnMajorInterleaved (cutlass)   GlobalLoadStreamPair (cutlass::gemm)   is_pointer_helper (cutlass::platform)   GemmSharedStoreTileDTraits::ThreadOffset (cutlass::gemm)   
complex (cutlass::platform)   greater (cutlass::platform)   is_pointer_helper< T * > (cutlass::platform)   RegularTilePredicateFunctor (cutlass)   HgemmCrosswiseGlobalTileTraits::ThreadOffset (cutlass::gemm)   
ComputeOffsetFromShape (cutlass)   
  h  
+
is_pow2 (cutlass)   remove_const (cutlass::platform)   TileAllocation (cutlass)   
ComputeOffsetFromStrides (cutlass)   is_same (cutlass::platform)   remove_const< const T > (cutlass::platform)   TileCoord (cutlass)   
ComputeThreadOffsetFromStrides (cutlass)   HgemmConfig (cutlass::gemm)   is_same< A, A > (cutlass::platform)   remove_cv (cutlass::platform)   TiledThreadOffset (cutlass)   
ComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, 1 >, Shape< 1, S_h_, S_w_, 1 > > (cutlass)   HgemmCrosswiseGlobalTileTraits (cutlass::gemm)   is_trivially_copyable (cutlass::platform)   remove_volatile (cutlass::platform)   TileIteratorBase (cutlass)   
ComputeThreadOffsetFromStrides< Shape< 1, T_h_, T_w_, T_c_ >, Shape< 1, S_h_, S_w_, S_c_ > > (cutlass)   HgemmSwizzle (cutlass::gemm)   is_void (cutlass::platform)   remove_volatile< volatile T > (cutlass::platform)   TileLoadIterator (cutlass)   
conditional (cutlass::platform)   HgemmTileTraitsHelperA (cutlass::gemm)   is_volatile (cutlass::platform)   ReshapeThreads (cutlass::gemm)   TileLoadStream (cutlass)   
conditional< false, T, F > (cutlass::platform)   HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > (cutlass::gemm)   is_volatile< volatile T > (cutlass::platform)   ReshapeThreads< Tile_, Threads_, true > (cutlass::gemm)   TileStoreIterator (cutlass)   
PredicateVector::ConstIterator (cutlass)   HgemmTileTraitsHelperB (cutlass::gemm)   PredicateVector::Iterator (cutlass)   ReshapeTile (cutlass)   TileStoreStream (cutlass)   
TensorRefBatchStrided::ConstIterator (cutlass)   HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > (cutlass::gemm)   IteratorAdvance (cutlass)   ReshapeTile< Tile_, kAccessSize_, true > (cutlass)   TileTraits (cutlass)   
TensorRefArray::ConstIterator (cutlass)   HgemmTraits (cutlass::gemm)   
  k  
+
MatrixLayout::RowMajor (cutlass)   TileTraitsContiguousMajor (cutlass)   
ConstPredicateTileAdapter (cutlass)   HgemmTraitsHelper (cutlass::gemm)   MatrixLayout::RowMajorBlockLinear (cutlass)   TileTraitsStandard (cutlass)   
MatrixLayout::ContiguousLayout (cutlass)   HgemmTransformerA (cutlass::gemm)   KernelLaunchConfiguration (cutlass)   RowMajorBlockSwizzle (cutlass::gemm)   TileTraitsStrideMajor (cutlass)   
Convert (cutlass)   HgemmTransformerA< MatrixLayout::kColumnMajor, Iterator_ > (cutlass::gemm)   
  l  
+
MatrixLayout::RowMajorInterleaved (cutlass)   TileTraitsWarpRake (cutlass)   
Convert< Fragment< InputScalar_, kScalars_ >, Fragment< OutputScalar_, kScalars_ > > (cutlass)   HgemmTransformerA< MatrixLayout::kRowMajor, Iterator_ > (cutlass::gemm)   
  s  
+
PredicateVector::TrivialIterator (cutlass)   
Coord (cutlass)   HgemmTransformerB (cutlass::gemm)   Launch (cutlass::gemm)   TrivialPredicateTileAdapter (cutlass)   
Copy (cutlass)   HgemmTransformerB< MatrixLayout::kColumnMajor, Iterator_ > (cutlass::gemm)   Launch< Gemm, false > (cutlass::gemm)   ScalarIO (cutlass)   
  u  
DgemmTraits (cutlass::gemm)   HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > (cutlass::gemm)   is_pow2 (cutlass)   unique_ptr (cutlass::platform)   
divide_assert (cutlass)   HgemmTraits (cutlass::gemm)   is_same (cutlass::platform)   SgemmConfig (cutlass::gemm)   
  v  
+
  d  
+
HgemmTransformerB< MatrixLayout::kRowMajor, Iterator_ > (cutlass::gemm)   less (cutlass::platform)   ScalarOrPointer (cutlass::detail)   
  i  
+
LinearScaling (cutlass::gemm)   SgemmConfig (cutlass::gemm)   uint4_t (cutlass)   
DebugType   LinearScalingDevicePtr (cutlass::gemm)   SgemmLBTraits (cutlass::gemm)   unique_ptr (cutlass::platform)   
DebugValue   Identity (cutlass)   Load (cutlass)   SgemmTraits (cutlass::gemm)   
  v  
is_base_of_helper::dummy (cutlass::platform)   HgemmTraitsHelper (cutlass::gemm)   is_same< A, A > (cutlass::platform)   SgemmTraits (cutlass::gemm)   
default_delete (cutlass::platform)   IdentityBlockSwizzle (cutlass::gemm)   Load< double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16 > (cutlass)   Shape (cutlass)   
default_delete< T[]> (cutlass::platform)   IdentityTensorMapFunc (cutlass)   Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2 > (cutlass)   ShapeAdd (cutlass)   Vector (cutlass)   
DgemmConfig (cutlass::gemm)   IgemmConfig (cutlass::gemm)   Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16 > (cutlass)   ShapeCount (cutlass)   Vector< bin1_t, kLanes_ > (cutlass)   
DgemmTraits (cutlass::gemm)   IgemmConfig< OutputTile_, int8_t, ThreadGemmShape_ > (cutlass::gemm)   Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4 > (cutlass)   ShapeDiv (cutlass)   Vector< half, 1 > (cutlass)   
divide_assert (cutlass)   IgemmEpilogue (cutlass::gemm)   Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8 > (cutlass)   ShapeDivCeiling (cutlass)   Vector< half, kLanes_ > (cutlass)   
is_base_of_helper::dummy (cutlass::platform)   IgemmEpilogue< GemmEpilogueTraits_, true > (cutlass::gemm)   Load< Scalar_, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size > (cutlass)   ShapeMax (cutlass)   Vector< int4_t, kLanes_ > (cutlass)   
DumpType (cutlass)   IgemmEpilogueScalar (cutlass::gemm)   Load< Vector< bin1_t, 32 >, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size > (cutlass)   ShapeMin (cutlass)   Vector< uint4_t, kLanes_ > (cutlass)   
  e  
-
HgemmTransformerA (cutlass::gemm)   is_trivially_copyable (cutlass::platform)   Shape (cutlass)   Vector (cutlass)   
HgemmTransformerA< MatrixLayout::kColumnMajor, Iterator_ > (cutlass::gemm)   is_void (cutlass::platform)   ShapeAdd (cutlass)   Vector< half, kLanes_ > (cutlass)   
enable_if (cutlass::platform)   HgemmTransformerA< MatrixLayout::kRowMajor, Iterator_ > (cutlass::gemm)   is_volatile (cutlass::platform)   ShapeCount (cutlass)   Vectorize (cutlass)   
enable_if< false, T > (cutlass::platform)   HgemmTransformerB (cutlass::gemm)   is_volatile< volatile T > (cutlass::platform)   ShapeDiv (cutlass)   Vectorize< Element_, 1 > (cutlass)   
Extent (cutlass)   HgemmTransformerB< MatrixLayout::kColumnMajor, Iterator_ > (cutlass::gemm)   PredicateVector::Iterator (cutlass)   ShapeMax (cutlass)   VectorTraits (cutlass)   
Extent< Vector< T, Lanes > > (cutlass)   HgemmTransformerB< MatrixLayout::kRowMajor, Iterator_ > (cutlass::gemm)   IteratorAdvance (cutlass)   ShapeMin (cutlass)   VectorTraits< Vector< T, Lanes > > (cutlass)   
Extent< Vector< T, Lanes > const > (cutlass)   
  i  
-
IteratorFragment (cutlass)   ShapeMul (cutlass)   VectorTraits< Vector< T, Lanes > const > (cutlass)   
IgemmEpilogueScalar< int > (cutlass::gemm)   Load< Vector< int4_t, 8 >, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size > (cutlass)   ShapeMul (cutlass)   Vectorize (cutlass)   
IgemmEpilogueTraits (cutlass::gemm)   Load< Vector< uint4_t, 8 >, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size > (cutlass)   ShapeScale (cutlass)   Vectorize< Vector< bin1_t, 32 >, kLanes_ > (cutlass)   
enable_if (cutlass::platform)   IgemmEpilogueTraitsHelper (cutlass::gemm)   log2_down (cutlass)   ShapeStrides (cutlass)   Vectorize< Vector< int4_t, 8 >, kLanes_ > (cutlass)   
enable_if< false, T > (cutlass::platform)   IgemmFloatToInt8Converter (cutlass::gemm)   log2_down< N, 1, Count > (cutlass)   ShapeSub (cutlass)   Vectorize< Vector< uint4_t, 8 >, kLanes_ > (cutlass)   
Extent (cutlass)   IgemmGlobalIteratorAb (cutlass::gemm)   log2_up (cutlass)   SharedLoadStream (cutlass::gemm)   VectorTraits (cutlass)   
Extent< Vector< T, Lanes > > (cutlass)   IgemmGlobalLoadTransformer (cutlass::gemm)   log2_up< N, 1, Count > (cutlass)   GemmEpilogueTraits::SharedStorage (cutlass::gemm)   VectorTraits< Vector< T, Lanes > > (cutlass)   
Extent< Vector< T, Lanes > const > (cutlass)   IgemmGlobalLoadTransformer< Fragment< int8_t, kElements_ >, float > (cutlass::gemm)   
  m  
+
GlobalLoadStreamPair::SharedStorage (cutlass::gemm)   VectorTraits< Vector< T, Lanes > const > (cutlass)   
  f  
-
  l  
-
ShapeScale (cutlass)   
  w  
+
IgemmGlobalStoreTransformer (cutlass::gemm)   GemmTraits::SharedStorage (cutlass::gemm)   
  w  
+
IgemmGlobalStoreTransformer< float, Fragment< int8_t, kElements_ > > (cutlass::gemm)   GemmTraits::MainLoopSharedStorage (cutlass::gemm)   GlobalLoadStream::SharedStorage (cutlass::gemm)   
Fp16SgemmConfig (cutlass::gemm)   IgemmGlobalTileTraits (cutlass::gemm)   MatrixCoord (cutlass)   ClearAccumulators::SharedStorage (cutlass::gemm)   WmmaGemmGlobalIteratorCd (cutlass::gemm)   
Fp16SgemmSgemmTraits (cutlass::gemm)   IgemmInt8ToFloatConverter (cutlass::gemm)   MatrixLayout (cutlass)   SharedStreamPair (cutlass::gemm)   WmmaGemmGlobalIteratorCdTraits (cutlass::gemm)   
Fragment (cutlass)   IgemmSharedStoreTransformer (cutlass::gemm)   MatrixTransform (cutlass)   SimplifiedGemmEpilogueTraits (cutlass::gemm)   
  z  
Identity (cutlass)   ShapeStrides (cutlass)   
Fragment (cutlass)   IdentityBlockSwizzle (cutlass::gemm)   less (cutlass::platform)   ShapeSub (cutlass)   WmmaGemmGlobalIteratorCd (cutlass::gemm)   
FragmentConstIterator (cutlass)   IgemmConfig (cutlass::gemm)   LinearScaling (cutlass::gemm)   GemmTraits::SharedLoadStream (cutlass::gemm)   WmmaGemmGlobalIteratorCdTraits (cutlass::gemm)   
FragmentIterator (cutlass)   IgemmConfig< OutputTile_, int8_t, AccumulatorsPerThread_ > (cutlass::gemm)   Load (cutlass)   SharedLoadStream (cutlass::gemm)   
FragmentLoad (cutlass)   IgemmContiguousGlobalTileTraits (cutlass::gemm)   Load< double, 2, Memory_, true, 16 > (cutlass)   ClearAccumulators::SharedStorage (cutlass::gemm)   
FragmentLoad< IteratorFragment::kScalar, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride > (cutlass)   IgemmEpilogue (cutlass::gemm)   Load< Scalar_, Lanes_, Memory_, true, 16 > (cutlass)   GemmEpilogueTraits::SharedStorage (cutlass::gemm)   
FragmentLoad< IteratorFragment::kWmmaMatrix, kAccessSize, Scalar_, Memory_, FragmentElement_, kStride > (cutlass)   IgemmEpilogue< GemmEpilogueTraits_, true > (cutlass::gemm)   Load< Scalar_, Lanes_, Memory_, true, 4 > (cutlass)   GemmTraits::SharedStorage (cutlass::gemm)   
FragmentConstIterator (cutlass)   IgemmSwizzle (cutlass::gemm)   Max (cutlass)   SimplifiedGemmTraits (cutlass::gemm)   
FragmentElementType (cutlass)   IgemmTileTraitsHelperA (cutlass::gemm)   MemorySpace (cutlass)   SimplifiedGemmTraitsHelper (cutlass::gemm)   ZipConvert (cutlass)   
FragmentIterator (cutlass)   IgemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_, Index_ > (cutlass::gemm)   Min (cutlass)   sqrt_est (cutlass)   ZipFragment (cutlass)   
FragmentMultiplyAdd (cutlass::gemm)   IgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_, Index_ > (cutlass::gemm)   
  n  
+
StorageType (cutlass)   ZipTensorRef (cutlass)   
FragmentMultiplyAdd< half, half, true > (cutlass::gemm)   IgemmTileTraitsHelperB (cutlass::gemm)   StorageType< 1 > (cutlass)   ZipTileAllocation (cutlass)   
  g  
+
IgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_, Index_ > (cutlass::gemm)   nullptr_t (cutlass::platform)   StorageType< 2 > (cutlass)   ZipTileIterator (cutlass)   
IgemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_, Index_ > (cutlass::gemm)   
  p  
+
StorageType< 4 > (cutlass)   
Gemm (cutlass::gemm)   IgemmTraits (cutlass::gemm)   Store (cutlass)   
alignment_of::pad (cutlass::platform)   
-
a | b | c | d | e | f | g | h | i | l | m | n | p | r | s | t | u | v | w
+
a | b | c | d | e | f | g | h | i | k | l | m | n | p | r | s | t | u | v | w | z
diff --git a/docs/clear__accumulators_8h.html b/docs/clear__accumulators_8h.html index b4bd3b39c2..cd8f6307a7 100644 --- a/docs/clear__accumulators_8h.html +++ b/docs/clear__accumulators_8h.html @@ -82,7 +82,7 @@

Defines abstractions for efficiently clearing accumulator tiles. More...

-
#include <cutlass/vector.h>
+
#include "cutlass/vector.h"

Go to the source code of this file.

@@ -104,7 +104,7 @@ diff --git a/docs/clear__accumulators_8h_source.html b/docs/clear__accumulators_8h_source.html index 1a6f517fb5..7c0423a5f9 100644 --- a/docs/clear__accumulators_8h_source.html +++ b/docs/clear__accumulators_8h_source.html @@ -76,16 +76,17 @@
clear_accumulators.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #include <cutlass/vector.h>
31 
32 namespace cutlass {
33 namespace gemm {
34 
36 
37 template <typename Scalar_, int kLanes_ = 1>
40  struct SharedStorage {};
41 
43  CUTLASS_DEVICE ClearAccumulators(SharedStorage& shared_storage) {}
44 
46  template <typename Fragment_>
47  CUTLASS_DEVICE void clear(Fragment_& fragment) {
48  fragment.clear();
49  }
50 };
51 
53 
54 } // namespace gemm
55 } // namespace cutlass
Definition: convert.h:33
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #include "cutlass/vector.h"
31 
32 namespace cutlass {
33 namespace gemm {
34 
36 
37 template <typename Scalar_, int kLanes_ = 1>
40  struct SharedStorage {};
41 
43  CUTLASS_DEVICE ClearAccumulators(SharedStorage& shared_storage) {}
44 
46  CUTLASS_DEVICE ClearAccumulators() {}
47 
49  template <typename Fragment_>
50  CUTLASS_DEVICE void clear(Fragment_& fragment) {
51  fragment.clear();
52  }
53 };
54 
56 
57 } // namespace gemm
58 } // namespace cutlass
Definition: convert.h:33
Definition: clear_accumulators.h:38
CUTLASS_DEVICE ClearAccumulators(SharedStorage &shared_storage)
Ctor.
Definition: clear_accumulators.h:43
Defines a 1D vector of elements held in the registers of each thread.
-
CUTLASS_DEVICE void clear(Fragment_ &fragment)
Clear the fragment.
Definition: clear_accumulators.h:47
+
CUTLASS_DEVICE void clear(Fragment_ &fragment)
Clear the fragment.
Definition: clear_accumulators.h:50
The shared storage.
Definition: clear_accumulators.h:40
+
CUTLASS_DEVICE ClearAccumulators()
Ctor.
Definition: clear_accumulators.h:46
diff --git a/docs/complex_8h.html b/docs/complex_8h.html new file mode 100644 index 0000000000..e94494d213 --- /dev/null +++ b/docs/complex_8h.html @@ -0,0 +1,263 @@ + + + + + + + +Cutlass: complex.h File Reference + + + + + + + + + + +
+
+
+ + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + + +
+ +
+
complex.h File Reference
+
+
+
#include <cuComplex.h>
+#include "cutlass/cutlass.h"
+#include <iosfwd>
+
+

Go to the source code of this file.

+ + + + +

+Classes

class  cutlass::platform::complex< T >
 
+ + + + + +

+Namespaces

 cutlass
 
 cutlass::platform
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Functions

CUTLASS_HOST_DEVICE float const & cutlass::platform::real (cuFloatComplex const &z)
 Returns the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE float & cutlass::platform::real (cuFloatComplex &z)
 Returns the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE double const & cutlass::platform::real (cuDoubleComplex const &z)
 Returns the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE double & cutlass::platform::real (cuDoubleComplex &z)
 Returns the real part of the complex number. More...
 
CUTLASS_HOST_DEVICE float const & cutlass::platform::imag (cuFloatComplex const &z)
 Returns the imaginary part of the complex number. More...
 
CUTLASS_HOST_DEVICE float & cutlass::platform::imag (cuFloatComplex &z)
 Returns the imaginary part of the complex number. More...
 
CUTLASS_HOST_DEVICE double const & cutlass::platform::imag (cuDoubleComplex const &z)
 Returns the imaginary part of the complex number. More...
 
CUTLASS_HOST_DEVICE double & cutlass::platform::imag (cuDoubleComplex &z)
 Returns the imaginary part of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T const & cutlass::platform::real (complex< T > const &z)
 Returns the real part of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T & cutlass::platform::real (complex< T > &z)
 Returns the real part of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T const & cutlass::platform::imag (complex< T > const &z)
 Returns the imaginary part of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T & cutlass::platform::imag (complex< T > &z)
 Returns the imaginary part of the complex number. More...
 
template<typename T >
std::ostream & cutlass::platform::operator<< (std::ostream &out, complex< T > const &z)
 
template<typename T >
CUTLASS_HOST_DEVICE bool cutlass::platform::operator== (complex< T > const &lhs, complex< T > const &rhs)
 Equality operator. More...
 
template<typename T >
CUTLASS_HOST_DEVICE bool cutlass::platform::operator!= (complex< T > const &lhs, complex< T > const &rhs)
 Inequality operator. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::operator+ (complex< T > const &lhs, complex< T > const &rhs)
 Addition. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::operator- (complex< T > const &lhs, complex< T > const &rhs)
 Subtraction. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::operator* (complex< T > const &lhs, complex< T > const &rhs)
 Multiplication. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::operator* (complex< T > const &lhs, T const &s)
 Scalar Multiplication. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::operator* (T const &s, complex< T > const &rhs)
 Scalar Multiplication. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::operator/ (complex< T > const &lhs, complex< T > const &rhs)
 Division. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::operator/ (complex< T > const &lhs, T const &s)
 Scalar Division. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::operator/ (T const &s, complex< T > const &rhs)
 Scalar divided by complex. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > & cutlass::platform::operator+= (complex< T > &lhs, complex< T > const &rhs)
 Addition. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > & cutlass::platform::operator-= (complex< T > &lhs, complex< T > const &rhs)
 Subtraction. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > & cutlass::platform::operator*= (complex< T > &lhs, complex< T > const &rhs)
 Multiplication. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > & cutlass::platform::operator*= (complex< T > &lhs, T s)
 Scalar multiplication. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > & cutlass::platform::operator/= (complex< T > &lhs, complex< T > const &rhs)
 Division. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T cutlass::platform::abs (complex< T > const &z)
 Returns the magnitude of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T cutlass::platform::arg (complex< T > const &z)
 Returns the phase angle (argument) of the complex number. More...
 
template<typename T >
CUTLASS_HOST_DEVICE T cutlass::platform::norm (complex< T > const &z)
 Returns the squared magnitude. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::conj (complex< T > const &z)
 Returns the complex conjugate. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::proj (complex< T > const &z)
 Projects the complex number z onto the Riemann sphere. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::polar (T const &r, T const &theta=T())
 Returns a complex number with magnitude r and phase theta. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::exp (complex< T > const &z)
 Computes the complex exponential of z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::log (complex< T > const &z)
 Computes the complex natural logarithm of z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::log10 (complex< T > const &z)
 Computes the complex base-10 logarithm of z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::sqrt (complex< T > const &z)
 Computes the square root of complex number z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::cos (complex< T > const &z)
 Computes the cosine of complex z. More...
 
template<typename T >
CUTLASS_HOST_DEVICE complex< T > cutlass::platform::sin (complex< T > const &z)
 Computes the sin of complex z. More...
 
+
+ + + + diff --git a/docs/complex_8h_source.html b/docs/complex_8h_source.html new file mode 100644 index 0000000000..6270d22da7 --- /dev/null +++ b/docs/complex_8h_source.html @@ -0,0 +1,123 @@ + + + + + + + +Cutlass: complex.h Source File + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
complex.h
+
+
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 #pragma once
26 
27 #include <cuComplex.h>
28 #include "cutlass/cutlass.h"
29 #include <iosfwd>
30 
31 namespace cutlass {
32 namespace platform {
33 
35 
36 //
37 // Accessors for CUDA complex types
38 //
39 
41 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
42  // host-only type
44 float const &real(cuFloatComplex const &z) { return z.x; }
45 
47 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
48  // host-only type
50 float &real(cuFloatComplex &z) { return z.x; }
51 
53 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
54  // host-only type
56 double const &real(cuDoubleComplex const &z) { return z.x; }
57 
59 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
60  // host-only type
62 double &real(cuDoubleComplex &z) { return z.x; }
63 
65 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
66  // host-only type
68 float const &imag(cuFloatComplex const &z) { return z.y; }
69 
71 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
72  // host-only type
74 float &imag(cuFloatComplex &z) { return z.y; }
75 
77 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
78  // host-only type
80 double const &imag(cuDoubleComplex const &z) { return z.y; }
81 
83 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
84  // host-only type
86 double &imag(cuDoubleComplex &z) { return z.y; }
87 
89 
92 template <typename T>
93 class complex {
94  public:
96  typedef T value_type;
97 
98  private:
99  //
100  // Data members
101  //
102 
104  T _real;
105 
107  T _imag;
108 
109  public:
110 //
111 // Methods
112 //
113 
115 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
116  // host-only type
118  complex(T r = T(0), T i = T(0)) : _real(r), _imag(i) {}
119 
121 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
122  // host-only type
124  complex(cuFloatComplex const &z) : _real(platform::real(z)), _imag(platform::imag(z)) {}
125 
127 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
128  // host-only type
130  complex(cuDoubleComplex const &z) : _real(platform::real(z)), _imag(platform::imag(z)) {}
131 
133 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
134  // host-only type
136  T const &real() const { return _real; }
137 
139 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
140  // host-only type
142  T &real() { return _real; }
143 
145 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
146  // host-only type
148  T const &imag() const { return _imag; }
149 
151 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
152  // host-only type
154  T &imag() { return _imag; }
155 
157 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
158  // host-only type
160  operator cuFloatComplex() const { return make_cuFloatComplex(real(), imag()); }
161 
163 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
164  // host-only type
166  operator cuDoubleComplex() const { return make_cuDoubleComplex(real(), imag()); }
167 };
168 
169 //
170 // Accessors for complex template
171 //
172 
174 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
175  // host-only type
176 template <typename T>
177 CUTLASS_HOST_DEVICE T const &real(complex<T> const &z) {
178  return z.real();
179 }
180 
182 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
183  // host-only type
184 template <typename T>
186  return z.real();
187 }
188 
190 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
191  // host-only type
192 template <typename T>
193 CUTLASS_HOST_DEVICE T const &imag(complex<T> const &z) {
194  return z.imag();
195 }
196 
198 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
199  // host-only type
200 template <typename T>
202  return z.imag();
203 }
204 
205 //
206 // Output operators
207 //
208 
209 template <typename T>
210 std::ostream &operator<<(std::ostream &out, complex<T> const &z) {
211  T _r = real(z);
212  T _i = imag(z);
213  return out << _r << "+i" << _i;
214 }
215 
216 //
217 // Non-member operators defined for complex types
218 //
219 
221 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
222  // host-only type
223 template <typename T>
224 CUTLASS_HOST_DEVICE bool operator==(complex<T> const &lhs, complex<T> const &rhs) {
225  return real(lhs) == (rhs) && imag(lhs) == imag(rhs);
226 }
227 
229 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
230  // host-only type
231 template <typename T>
232 CUTLASS_HOST_DEVICE bool operator!=(complex<T> const &lhs, complex<T> const &rhs) {
233  return !(lhs == rhs);
234 }
235 
237 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
238  // host-only type
239 template <typename T>
241  return complex<T>(real(lhs) + real(rhs), imag(lhs) + imag(rhs));
242 }
243 
245 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
246  // host-only type
247 template <typename T>
249  return complex<T>(real(lhs) - real(rhs), imag(lhs) - imag(rhs));
250 }
251 
253 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
254  // host-only type
255 template <typename T>
257  return complex<T>(real(lhs) * real(rhs) - imag(lhs) * imag(rhs),
258  real(lhs) * imag(rhs) + imag(lhs) * real(rhs));
259 }
260 
262 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
263  // host-only type
264 template <typename T>
266  return complex<T>(real(lhs) * s, imag(lhs) * s);
267 }
268 
270 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
271  // host-only type
272 template <typename T>
274  return complex<T>(s * real(rhs), s * imag(rhs));
275 }
276 
278 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
279  // host-only type
280 template <typename T>
282  T d = (real(rhs) * (rhs) + imag(rhs) * imag(rhs));
283 
284  return complex<T>((real(lhs) * (rhs) + imag(lhs) * imag(rhs)) / d,
285  (imag(lhs) * (rhs)-real(lhs) * imag(rhs)) / d);
286 }
287 
289 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
290  // host-only type
291 template <typename T>
293  return complex<T>(real(lhs) / s, imag(lhs) / s);
294 }
295 
297 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
298  // host-only type
299 template <typename T>
301  T d = (real(rhs) * (rhs) + imag(rhs) * imag(rhs));
302 
303  return complex<T>((s * (rhs)) / d, -(s * imag(rhs)) / d);
304 }
305 
307 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
308  // host-only type
309 template <typename T>
311  lhs = (lhs + rhs);
312  return lhs;
313 }
314 
316 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
317  // host-only type
318 template <typename T>
320  lhs = (lhs - rhs);
321  return lhs;
322 }
323 
325 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
326  // host-only type
327 template <typename T>
329  lhs = (lhs * rhs);
330  return lhs;
331 }
332 
334 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
335  // host-only type
336 template <typename T>
338  lhs = (lhs * s);
339  return lhs;
340 }
341 
343 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
344  // host-only type
345 template <typename T>
347  lhs = (lhs / rhs);
348  return lhs;
349 }
350 
351 //
352 // Non-member functions defined for complex numbers
353 //
354 
356 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
357  // host-only type
358 template <typename T>
360  return sqrt(norm(z));
361 }
362 
364 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
365  // host-only type
366 template <typename T>
368  return atan2(imag(z), real(z));
369 }
370 
372 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
373  // host-only type
374 template <typename T>
376  return real(z) * real(z) + imag(z) * imag(z);
377 }
378 
380 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
381  // host-only type
382 template <typename T>
384  return complex<T>(real(z), -imag(z));
385 }
386 
388 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
389  // host-only type
390 template <typename T>
392  T d = real(z) * real(z) + imag(z) * imag(z) + T(1);
393  return complex<T>((T(2) * real(z)) / d, (T(2) * imag(z)) / d);
394 }
395 
397 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
398  // host-only type
399 template <typename T>
400 CUTLASS_HOST_DEVICE complex<T> polar(T const &r, T const &theta = T()) {
401  return complex<T>(r * cos(theta), r * sin(theta));
402 }
403 
405 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
406  // host-only type
407 template <typename T>
409  return complex<T>(real(z) * cos(imag(z)), real(z) * sin(imag(z)));
410 }
411 
413 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
414  // host-only type
415 template <typename T>
417  return complex<T>(log(abs(z)), arg(z));
418 }
419 
421 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
422  // host-only type
423 template <typename T>
425  return log(z) / T(log(T(10)));
426 }
427 
429 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
430  // host-only type
431 template <typename T>
433  return sqrt(T(2)) / T(2) *
434  complex<T>(sqrt(sqrt(norm(z)) + real(z)),
435  (imag(z) < 0 ? T(-1) : T(1)) * sqrt(sqrt(norm(z)) - real(z)));
436 }
437 
439 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
440  // host-only type
441 template <typename T>
443  return (exp(z) + exp(-z)) / T(2);
444 }
445 
447 #pragma hd_warning_disable // Suppresses warnings when attempting to instantiate complex<T> with a
448  // host-only type
449 template <typename T>
451  return (exp(-z) - exp(z)) * complex<T>(T(0), T(1) / T(2));
452 }
453 
455 
456 } // namespace platform
457 } // namespace cutlass
CUTLASS_HOST_DEVICE complex< T > proj(complex< T > const &z)
Projects the complex number z onto the Riemann sphere.
Definition: complex.h:391
+
Definition: convert.h:33
+
CUTLASS_HOST_DEVICE T & imag()
Accesses the imaginary part of the complex number.
Definition: complex.h:154
+
CUTLASS_HOST_DEVICE bool operator==(complex< T > const &lhs, complex< T > const &rhs)
Equality operator.
Definition: complex.h:224
+
CUTLASS_HOST_DEVICE T const & imag() const
Accesses the imaginary part of the complex number.
Definition: complex.h:148
+
CUTLASS_HOST_DEVICE complex< T > operator*(complex< T > const &lhs, complex< T > const &rhs)
Multiplication.
Definition: complex.h:256
+
CUTLASS_HOST_DEVICE complex< T > & operator-=(complex< T > &lhs, complex< T > const &rhs)
Subtraction.
Definition: complex.h:319
+
CUTLASS_HOST_DEVICE complex< T > operator-(complex< T > const &lhs, complex< T > const &rhs)
Subtraction.
Definition: complex.h:248
+
CUTLASS_HOST_DEVICE T & real()
Accesses the real part of the complex number.
Definition: complex.h:142
+
CUTLASS_HOST_DEVICE float const & real(cuFloatComplex const &z)
Returns the real part of the complex number.
Definition: complex.h:44
+
CUTLASS_HOST_DEVICE complex< T > sin(complex< T > const &z)
Computes the sin of complex z.
Definition: complex.h:450
+
CUTLASS_HOST_DEVICE complex(cuFloatComplex const &z)
Conversion from cuFloatComplex.
Definition: complex.h:124
+
CUTLASS_HOST_DEVICE complex< T > cos(complex< T > const &z)
Computes the cosine of complex z.
Definition: complex.h:442
+
CUTLASS_HOST_DEVICE complex< T > operator+(complex< T > const &lhs, complex< T > const &rhs)
Addition.
Definition: complex.h:240
+
CUTLASS_HOST_DEVICE complex< T > polar(T const &r, T const &theta=T())
Returns a complex number with magnitude r and phase theta.
Definition: complex.h:400
+
CUTLASS_HOST_DEVICE T const & real() const
Accesses the real part of the complex number.
Definition: complex.h:136
+
CUTLASS_HOST_DEVICE complex< T > & operator/=(complex< T > &lhs, complex< T > const &rhs)
Division.
Definition: complex.h:346
+
CUTLASS_HOST_DEVICE complex< T > sqrt(complex< T > const &z)
Computes the square root of complex number z.
Definition: complex.h:432
+
CUTLASS_HOST_DEVICE complex< T > & operator+=(complex< T > &lhs, complex< T > const &rhs)
Addition.
Definition: complex.h:310
+
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
+
CUTLASS_HOST_DEVICE float const & imag(cuFloatComplex const &z)
Returns the imaginary part of the complex number.
Definition: complex.h:68
+
CUTLASS_HOST_DEVICE complex< T > exp(complex< T > const &z)
Computes the complex exponential of z.
Definition: complex.h:408
+
CUTLASS_HOST_DEVICE complex< T > log10(complex< T > const &z)
Computes the base-10 logarithm of complex z.
Definition: complex.h:424
+
CUTLASS_HOST_DEVICE T norm(complex< T > const &z)
Returns the squared magnitude.
Definition: complex.h:375
+
CUTLASS_HOST_DEVICE bool operator!=(complex< T > const &lhs, complex< T > const &rhs)
Inequality operator.
Definition: complex.h:232
+
CUTLASS_HOST_DEVICE T abs(complex< T > const &z)
Returns the magnitude of the complex number.
Definition: complex.h:359
+
CUTLASS_HOST_DEVICE complex< T > & operator*=(complex< T > &lhs, complex< T > const &rhs)
Multiplication.
Definition: complex.h:328
+
CUTLASS_HOST_DEVICE complex(cuDoubleComplex const &z)
Conversion from cuDoubleComplex.
Definition: complex.h:130
+
CUTLASS_HOST_DEVICE T arg(complex< T > const &z)
Returns the phase angle (argument) of the complex number.
Definition: complex.h:367
+
CUTLASS_HOST_DEVICE complex(T r=T(0), T i=T(0))
Constructor.
Definition: complex.h:118
+
Definition: complex.h:93
+
CUTLASS_HOST_DEVICE complex< T > log(complex< T > const &z)
Computes the natural logarithm of complex z.
Definition: complex.h:416
+
T value_type
Type alias for scalar type.
Definition: complex.h:96
+
Basic include for CUTLASS macros.
+
CUTLASS_HOST_DEVICE complex< T > operator/(complex< T > const &lhs, complex< T > const &rhs)
Division.
Definition: complex.h:281
+
CUTLASS_HOST_DEVICE complex< T > conj(complex< T > const &z)
Returns the complex conjugate.
Definition: complex.h:383
+
+ + + + diff --git a/docs/convert_8h.html b/docs/convert_8h.html index 422c520173..cd3bf4bb8a 100644 --- a/docs/convert_8h.html +++ b/docs/convert_8h.html @@ -82,7 +82,7 @@

Defines conversion operations among Fragments of different base type. More...

-
#include <cutlass/fragment.h>
+
#include "cutlass/fragment.h"

Go to the source code of this file.

@@ -103,7 +103,7 @@ diff --git a/docs/convert_8h_source.html b/docs/convert_8h_source.html index 6e877d293e..22ec9d4b81 100644 --- a/docs/convert_8h_source.html +++ b/docs/convert_8h_source.html @@ -76,7 +76,7 @@
convert.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include <cutlass/fragment.h>
32 
33 namespace cutlass {
34 
36 
37 template <typename InputFragment_, typename OutputFragment_>
38 struct Convert {};
39 
41 
42 template <typename InputScalar_, typename OutputScalar_, int kScalars_>
43 struct Convert<Fragment<InputScalar_, kScalars_>, Fragment<OutputScalar_, kScalars_> > {
48 
50  CUTLASS_DEVICE Convert() {}
51 
53  CUTLASS_DEVICE void transform(InputFragment const& src, OutputFragment& dst) {
54  transform(src, 0, dst);
55  }
56 
58  template <typename Fragment_>
59  CUTLASS_DEVICE void transform(Fragment_ const& src, int offset, OutputFragment& dst) {
60  for (int i = 0; i < kScalars_; ++i) {
61  dst[i] = static_cast<OutputScalar_>(src[i + offset]);
62  }
63  }
64 };
65 
67 
68 template <typename Fragment_>
69 struct Copy {
71  typedef Fragment_ InputFragment;
73  typedef Fragment_ OutputFragment;
74 
76  CUTLASS_DEVICE Copy() {}
77 
79  CUTLASS_DEVICE void transform(Fragment_ const& src, Fragment_& dst) { transform(src, 0, dst); }
80 
82  template <typename InputFragment_>
83  CUTLASS_DEVICE void transform(InputFragment_ const& src, int offset, Fragment_& dst) {
84  if (sizeof(typename Fragment_::Element) == 8) {
85  uint64_t const* src_ptr = reinterpret_cast<uint64_t const*>(&src[offset]);
86  uint64_t* dst_ptr = reinterpret_cast<uint64_t*>(&dst[0]);
87  for (int i = 0; i < sizeof(Fragment_) / 8; ++i) {
88  dst_ptr[i] = src_ptr[i];
89  }
90  } else {
91  uint32_t const* src_ptr = reinterpret_cast<uint32_t const*>(&src[offset]);
92  uint32_t* dst_ptr = reinterpret_cast<uint32_t*>(&dst[0]);
93  for (int i = 0; i < sizeof(Fragment_) / 4; ++i) {
94  dst_ptr[i] = src_ptr[i];
95  }
96  }
97  }
98 };
99 
101 
102 } // namespace cutlass
Definition: convert.h:33
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/fragment.h"
32 
33 namespace cutlass {
34 
36 
37 template <typename InputFragment_, typename OutputFragment_>
38 struct Convert {};
39 
41 
42 template <typename InputScalar_, typename OutputScalar_, int kScalars_>
43 struct Convert<Fragment<InputScalar_, kScalars_>, Fragment<OutputScalar_, kScalars_> > {
48 
50  CUTLASS_DEVICE Convert() {}
51 
53  CUTLASS_DEVICE void transform(InputFragment const& src, OutputFragment& dst) {
54  transform(src, 0, dst);
55  }
56 
58  template <typename Fragment_>
59  CUTLASS_DEVICE void transform(Fragment_ const& src, int offset, OutputFragment& dst) {
60  for (int i = 0; i < kScalars_; ++i) {
61  dst[i] = static_cast<OutputScalar_>(src[i + offset]);
62  }
63  }
64 };
65 
67 
68 template <typename Fragment_>
69 struct Copy {
71  typedef Fragment_ InputFragment;
73  typedef Fragment_ OutputFragment;
74 
76  CUTLASS_DEVICE Copy() {}
77 
79  CUTLASS_DEVICE void transform(Fragment_ const& src, Fragment_& dst) { transform(src, 0, dst); }
80 
82  template <typename InputFragment_>
83  CUTLASS_DEVICE void transform(InputFragment_ const& src, int offset, Fragment_& dst) {
84  if (sizeof(typename Fragment_::Element) == 8) {
85  uint64_t const* src_ptr = reinterpret_cast<uint64_t const*>(&src[offset]);
86  uint64_t* dst_ptr = reinterpret_cast<uint64_t*>(&dst[0]);
87  for (int i = 0; i < sizeof(Fragment_) / 8; ++i) {
88  dst_ptr[i] = src_ptr[i];
89  }
90  } else {
91  uint32_t const* src_ptr = reinterpret_cast<uint32_t const*>(&src[offset]);
92  uint32_t* dst_ptr = reinterpret_cast<uint32_t*>(&dst[0]);
93  for (int i = 0; i < sizeof(Fragment_) / 4; ++i) {
94  dst_ptr[i] = src_ptr[i];
95  }
96  }
97  }
98 };
99 
101 
102 } // namespace cutlass
Definition: convert.h:33
Fragment< OutputScalar_, kScalars_ > OutputFragment
The output fragment.
Definition: convert.h:47
Definition: convert.h:69
CUTLASS_DEVICE void transform(Fragment_ const &src, Fragment_ &dst)
Transform a fragment.
Definition: convert.h:79
@@ -94,7 +94,7 @@
diff --git a/docs/coord_8h.html b/docs/coord_8h.html index 5165038675..8bb9bea4d2 100644 --- a/docs/coord_8h.html +++ b/docs/coord_8h.html @@ -83,7 +83,8 @@

A Coord is a coordinate of arbitrary rank into a tensor or matrix. More...

-
@@ -92,7 +93,7 @@ - +
struct  cutlass::Identity
 Describes identity elements. More...
 
struct  cutlass::Coord< N_ >
struct  cutlass::Coord< Rank_, Index_ >
 Statically-sized array specifying Coords within a tensor. More...
 
@@ -115,23 +116,14 @@ - - - - - - - - - - - - + + +
CUTLASS_HOST_DEVICE Coord< 4 > cutlass::make_Coord (int _0, int _1, int _2, int _3)
 Helper to make a 4-element coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 2 > cutlass::get_Coord_hw (Coord< 3 > const &coord)
 Getter. More...
 
CUTLASS_HOST_DEVICE Coord< 2 > cutlass::get_Coord_hw (Coord< 4 > const &coord)
 Getter. More...
 
CUTLASS_HOST_DEVICE Coord< 3 > cutlass::get_Coord_hwc (Coord< 4 > const &coord)
 Getter. More...
 
CUTLASS_HOST_DEVICE Coord< 3 > cutlass::get_Coord_dhw (Coord< 4 > const &coord)
 Getter. More...
 
template<typename Shape_ >
CUTLASS_HOST_DEVICE Coord< 3 > cutlass::make_Coord_from_shape ()
 
diff --git a/docs/coord_8h_source.html b/docs/coord_8h_source.html index 71ec92e1af..b0e2162ccb 100644 --- a/docs/coord_8h_source.html +++ b/docs/coord_8h_source.html @@ -76,50 +76,54 @@
coord.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include <cutlass/cutlass.h>
32 
33 namespace cutlass {
34 
36 
38 struct Identity {
41  enum Kind { Additive = 0, Multiplicative = 1 };
42 };
43 
45 
47 template <int N_>
48 struct Coord {
49  //
50  // Type and constant definitions
51  //
52 
53  static int const N = N_;
54 
55  //
56  // Data members
57  //
58 
60  int idx[N];
61 
62  //
63  // Methods
64  //
65 
68  Coord(int value = 0) {
69  for (int i = 0; i < N; ++i) {
70  idx[i] = value;
71  }
72  }
73 
76  Coord(int _idx[]) {
77  for (int i = 0; i < N; ++i) {
78  idx[i] = _idx[i];
79  }
80  }
81 
84  Coord operator+(Coord const& b) const {
85  Coord c;
86  for (int i = 0; i < N; ++i) {
87  c.idx[i] = idx[i] + b.idx[i];
88  }
89  return c;
90  }
91 
94  Coord operator-(Coord const& b) const {
95  Coord c;
96  for (int i = 0; i < N; ++i) {
97  c.idx[i] = idx[i] - b.idx[i];
98  }
99  return c;
100  }
101 
104  Coord operator*(Coord const& b) const {
105  Coord c;
106  for (int i = 0; i < N; ++i) {
107  c.idx[i] = idx[i] * b.idx[i];
108  }
109  return c;
110  }
111 
114  Coord operator/(Coord const& b) const {
115  Coord c;
116  for (int i = 0; i < N; ++i) {
117  c.idx[i] = idx[i] / b.idx[i];
118  }
119  return c;
120  }
121 
124  Coord& operator+=(Coord const& b) {
125  for (int i = 0; i < N; ++i) {
126  idx[i] += b.idx[i];
127  }
128  return *this;
129  }
130 
133  Coord& operator-=(Coord const& b) {
134  for (int i = 0; i < N; ++i) {
135  idx[i] -= b.idx[i];
136  }
137  return *this;
138  }
139 
142  Coord& operator*=(Coord const& b) {
143  for (int i = 0; i < N; ++i) {
144  idx[i] *= b.idx[i];
145  }
146  return *this;
147  }
148 
151  Coord& operator/=(Coord const& b) {
152  for (int i = 0; i < N; ++i) {
153  idx[i] /= b.idx[i];
154  }
155  return *this;
156  }
157 
159  CUTLASS_HOST_DEVICE int& operator[](int dim) { return idx[dim]; }
160 
162  CUTLASS_HOST_DEVICE int const& operator[](int dim) const { return idx[dim]; }
163 
165  template <typename T>
166  CUTLASS_HOST_DEVICE T dot(Coord const& b, T sum) const {
167  for (int i = 0; i < N; ++i) {
168  sum += idx[i] * b.idx[i];
169  }
170  return sum;
171  }
172 
174  template <typename T>
175  CUTLASS_HOST_DEVICE T dot(Coord const& b) const {
176  T sum = T(0);
177  for (int i = 0; i < N; ++i) {
178  sum += idx[i] * b.idx[i];
179  }
180  return sum;
181  }
182 
184  template <int Dim>
186  return idx[Dim];
187  }
188 
191  int& at(int dim) { return idx[dim]; }
192 
194  template <int Dim>
195  CUTLASS_HOST_DEVICE int const& at() const {
196  return idx[Dim];
197  }
198 
201  int const& at(int dim) const { return idx[dim]; }
202 
205  bool operator==(Coord<N> const& b) const {
206  bool equal = true;
207  for (int i = 0; equal && i < N; ++i) {
208  equal = (idx[i] == b.idx[i]);
209  }
210  return equal;
211  }
212 
215  bool operator!=(Coord<N> const& b) const { return !(*this == b); }
216 
219  Coord& clamp(Coord<N> const& max, Coord<N> const& min = Coord<N>()) {
220  for (int i = 0; i < N; ++i) {
221  idx[i] = __NV_STD_MAX(__NV_STD_MIN(idx[i], max.idx[i]), min.idx[i]);
222  }
223  return *this;
224  }
225 
228  int count() const {
229  int product = idx[0];
230  for (int i = 1; i < N; ++i) {
231  product *= idx[i];
232  }
233  return product;
234  }
235 };
236 
238 
242  int values[1] = {_0};
243  return Coord<1>(values);
244 }
245 
248 Coord<2> make_Coord(int _0, int _1) {
249  int values[2] = {_0, _1};
250  return Coord<2>(values);
251 }
252 
255 Coord<3> make_Coord(int _0, int _1, int _2) {
256  int values[3] = {_0, _1, _2};
257  return Coord<3>(values);
258 }
259 
262 Coord<4> make_Coord(int _0, int _1, int _2, int _3) {
263  int values[4] = {_0, _1, _2, _3};
264  return Coord<4>(values);
265 }
266 
268 
271 Coord<2> get_Coord_hw(Coord<3> const& coord) { return make_Coord(coord[1], coord[2]); }
272 
275 Coord<2> get_Coord_hw(Coord<4> const& coord) { return make_Coord(coord[1], coord[2]); }
276 
279 Coord<3> get_Coord_hwc(Coord<4> const& coord) { return make_Coord(coord[1], coord[2], coord[3]); }
280 
283 Coord<3> get_Coord_dhw(Coord<4> const& coord) { return make_Coord(coord[0], coord[1], coord[2]); }
284 
286 
287 } // namespace cutlass
CUTLASS_HOST_DEVICE int const & operator[](int dim) const
Member access operator.
Definition: coord.h:162
-
CUTLASS_HOST_DEVICE int count() const
Returns the product of all elements.
Definition: coord.h:228
-
Describes identity elements.
Definition: coord.h:38
-
CUTLASS_HOST_DEVICE constexpr const T & max(const T &a, const T &b)
std::max
Definition: platform.h:207
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/cutlass.h"
32 #include "cutlass/util/platform.h"
33 
34 namespace cutlass {
35 
37 
39 struct Identity {
42  enum Kind { Additive = 0, Multiplicative = 1 };
43 };
44 
46 
48 template <int Rank_, typename Index_ = int>
49 struct Coord {
50  //
51  // Type and constant definitions
52  //
53 
55  static int const kRank = Rank_;
56 
58  static int const N = Rank_;
59 
61  typedef Index_ Index;
62 
63  //
64  // Data members
65  //
66 
69 
70  //
71  // Methods
72  //
73 
76  Coord(Index value = 0) {
77  for (int i = 0; i < kRank; ++i) {
78  idx[i] = value;
79  }
80  }
81 
84  Coord(Index _idx[]) {
85  for (int i = 0; i < kRank; ++i) {
86  idx[i] = _idx[i];
87  }
88  }
89 
92  Coord(Coord<kRank> const &coord) {
93  for (int i = 0; i < kRank; ++i) {
94  idx[i] = coord[i];
95  }
96  }
97 
100  template <int Slice>
102  Coord<Slice> slice(int start = 0, Index identity = 0) const {
103  Coord<Slice> result;
104  for (int i = 0; i < Slice; ++i) {
105  if (i + start < kRank) {
106  slice[i] = idx[i + start];
107  }
108  else {
109  slice[i] = identity;
110  }
111  }
112  return result;
113  }
114 
117  operator bool() const {
118  for (int i = 0; i < kRank; ++i) {
119  if (idx[i]) {
120  return true;
121  }
122  }
123  return false;
124  }
125 
128  bool operator!() const {
129  for (int i = 0; i < kRank; ++i) {
130  if (idx[i]) {
131  return false;
132  }
133  }
134  return true;
135  }
136 
139  Coord operator+(Coord const& b) const {
140  Coord c;
141  for (int i = 0; i < kRank; ++i) {
142  c.idx[i] = idx[i] + b.idx[i];
143  }
144  return c;
145  }
146 
149  Coord operator-(Coord const& b) const {
150  Coord c;
151  for (int i = 0; i < kRank; ++i) {
152  c.idx[i] = idx[i] - b.idx[i];
153  }
154  return c;
155  }
156 
159  Coord operator*(Coord const& b) const {
160  Coord c;
161  for (int i = 0; i < kRank; ++i) {
162  c.idx[i] = idx[i] * b.idx[i];
163  }
164  return c;
165  }
166 
169  Coord operator/(Coord const& b) const {
170  Coord c;
171  for (int i = 0; i < kRank; ++i) {
172  c.idx[i] = idx[i] / b.idx[i];
173  }
174  return c;
175  }
176 
179  Coord& operator+=(Coord const& b) {
180  for (int i = 0; i < kRank; ++i) {
181  idx[i] += b.idx[i];
182  }
183  return *this;
184  }
185 
188  Coord& operator-=(Coord const& b) {
189  for (int i = 0; i < kRank; ++i) {
190  idx[i] -= b.idx[i];
191  }
192  return *this;
193  }
194 
197  Coord& operator*=(Coord const& b) {
198  for (int i = 0; i < kRank; ++i) {
199  idx[i] *= b.idx[i];
200  }
201  return *this;
202  }
203 
206  Coord& operator/=(Coord const& b) {
207  for (int i = 0; i < kRank; ++i) {
208  idx[i] /= b.idx[i];
209  }
210  return *this;
211  }
212 
214  CUTLASS_HOST_DEVICE Index& operator[](int dim) { return idx[dim]; }
215 
217  CUTLASS_HOST_DEVICE Index const& operator[](int dim) const { return idx[dim]; }
218 
220  template <typename T>
221  CUTLASS_HOST_DEVICE T dot(Coord const& b, T sum) const {
222  for (int i = 0; i < kRank; ++i) {
223  sum += idx[i] * b.idx[i];
224  }
225  return sum;
226  }
227 
229  template <typename T>
230  CUTLASS_HOST_DEVICE T dot(Coord const& b) const {
231  T sum = T(0);
232  for (int i = 0; i < kRank; ++i) {
233  sum += idx[i] * b.idx[i];
234  }
235  return sum;
236  }
237 
239  template <int Dim>
241  return idx[Dim];
242  }
243 
246  Index& at(int dim) { return idx[dim]; }
247 
249  template <int Dim>
250  CUTLASS_HOST_DEVICE Index const& at() const {
251  return idx[Dim];
252  }
253 
256  Index const& at(int dim) const { return idx[dim]; }
257 
260  bool operator==(Coord<kRank> const& b) const {
261  bool equal = true;
262  for (int i = 0; equal && i < kRank; ++i) {
263  equal = (idx[i] == b.idx[i]);
264  }
265  return equal;
266  }
267 
270  bool operator!=(Coord<kRank> const& b) const { return !(*this == b); }
271 
275  for (int i = 0; i < kRank; ++i) {
276  idx[i] = __NV_STD_MAX(__NV_STD_MIN(idx[i], max.idx[i]), min.idx[i]);
277  }
278  return *this;
279  }
280 
283  Index count() const {
284  Index product = idx[0];
285  for (int i = 1; i < kRank; ++i) {
286  product *= idx[i];
287  }
288  return product;
289  }
290 
293  bool operator<(Coord<kRank> const &b) const {
294  for (int i = 0; i < kRank; ++i) {
295  if (!(idx[i] < b[i])) {
296  return false;
297  }
298  }
299  return true;
300  }
301 
304  bool operator<=(Coord<kRank> const &b) const {
305  for (int i = 0; i < kRank; ++i) {
306  if (!(idx[i] <= b[i])) {
307  return false;
308  }
309  }
310  return true;
311  }
312 };
313 
315 
319  int values[1] = {_0};
320  return Coord<1>(values);
321 }
322 
325 Coord<2> make_Coord(int _0, int _1) {
326  int values[2] = {_0, _1};
327  return Coord<2>(values);
328 }
329 
332 Coord<3> make_Coord(int _0, int _1, int _2) {
333  int values[3] = {_0, _1, _2};
334  return Coord<3>(values);
335 }
336 
339 Coord<4> make_Coord(int _0, int _1, int _2, int _3) {
340  int values[4] = {_0, _1, _2, _3};
341  return Coord<4>(values);
342 }
343 
345 
346 template <typename Shape_>
348  return make_Coord(Shape_::kD, Shape_::kH, Shape_::kW);
349 }
350 
352 
353 } // namespace cutlass
Describes identity elements.
Definition: coord.h:39
+
CUTLASS_HOST_DEVICE constexpr const T & max(const T &a, const T &b)
std::max
Definition: platform.h:215
Definition: convert.h:33
-
CUTLASS_HOST_DEVICE bool operator==(Coord< N > const &b) const
Determines if two Coord<> objects are equal.
Definition: coord.h:205
-
CUTLASS_HOST_DEVICE Coord & operator+=(Coord const &b)
In-place addition.
Definition: coord.h:124
-
CUTLASS_HOST_DEVICE bool operator!=(Coord< N > const &b) const
Not equal.
Definition: coord.h:215
-
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 2-element coordinate.
Definition: coord.h:241
-
CUTLASS_HOST_DEVICE Coord< 3 > get_Coord_hwc(Coord< 4 > const &coord)
Getter.
Definition: coord.h:279
-
CUTLASS_HOST_DEVICE Coord< 3 > get_Coord_dhw(Coord< 4 > const &coord)
Getter.
Definition: coord.h:283
-
CUTLASS_HOST_DEVICE Coord & clamp(Coord< N > const &max, Coord< N > const &min=Coord< N >())
Clamps a coordinate to a range specified by maximum and minimum values.
Definition: coord.h:219
-
CUTLASS_HOST_DEVICE int const & at() const
Gets the index of a given Coord element.
Definition: coord.h:195
-
CUTLASS_HOST_DEVICE Coord operator/(Coord const &b) const
Element-wise division.
Definition: coord.h:114
-
Kind
Definition: coord.h:41
-
CUTLASS_HOST_DEVICE T dot(Coord const &b, T sum) const
Computes the dot product of two Coord instances.
Definition: coord.h:166
-
CUTLASS_HOST_DEVICE Coord(int _idx[])
Constructs from an array of integers.
Definition: coord.h:76
-
#define __NV_STD_MAX(a, b)
Select maximum(a, b)
Definition: platform.h:155
-
CUTLASS_HOST_DEVICE int & at(int dim)
Access via index; may limit unrolling potential.
Definition: coord.h:191
-
CUTLASS_HOST_DEVICE int & operator[](int dim)
Member access operator.
Definition: coord.h:159
-
CUTLASS_HOST_DEVICE Coord & operator-=(Coord const &b)
In-place subtraction.
Definition: coord.h:133
-
CUTLASS_HOST_DEVICE Coord operator*(Coord const &b) const
Element-wise multiplication.
Definition: coord.h:104
-
CUTLASS_HOST_DEVICE Coord(int value=0)
Default ctor initializes uniformly.
Definition: coord.h:68
-
CUTLASS_HOST_DEVICE Coord< 2 > get_Coord_hw(Coord< 3 > const &coord)
Getter.
Definition: coord.h:271
-
static int const N
Definition: coord.h:53
-
#define __NV_STD_MIN(a, b)
Select minimum(a, b)
Definition: platform.h:160
-
CUTLASS_HOST_DEVICE T dot(Coord const &b) const
Computes the dot product of two Coord instances.
Definition: coord.h:175
-
CUTLASS_HOST_DEVICE Coord operator-(Coord const &b) const
Element-wise subtraction.
Definition: coord.h:94
+
CUTLASS_HOST_DEVICE Coord operator-(Coord const &b) const
Element-wise subtraction.
Definition: coord.h:149
+
CUTLASS_HOST_DEVICE Index const & at(int dim) const
Access via index; may limit unrolling potential.
Definition: coord.h:256
+
CUTLASS_HOST_DEVICE Index const & operator[](int dim) const
Member access operator.
Definition: coord.h:217
+
CUTLASS_HOST_DEVICE Coord operator/(Coord const &b) const
Element-wise division.
Definition: coord.h:169
+
CUTLASS_HOST_DEVICE Index & operator[](int dim)
Member access operator.
Definition: coord.h:214
+
CUTLASS_HOST_DEVICE Coord< 1 > make_Coord(int _0)
Helper to make a 1-element coordinate.
Definition: coord.h:318
+
static int const kRank
Number of elements in Coord.
Definition: coord.h:55
+
Index_ Index
Index type used to store elements.
Definition: coord.h:61
+
CUTLASS_HOST_DEVICE Coord & operator*=(Coord const &b)
In-place multiplication.
Definition: coord.h:197
+
CUTLASS_HOST_DEVICE Index & at(int dim)
Access via index; may limit unrolling potential.
Definition: coord.h:246
+
C++ features that may be otherwise unimplemented for CUDA device functions.
+
CUTLASS_HOST_DEVICE Index count() const
Returns the product of all elements.
Definition: coord.h:283
+
CUTLASS_HOST_DEVICE Coord operator*(Coord const &b) const
Element-wise multiplication.
Definition: coord.h:159
+
Kind
Definition: coord.h:42
+
CUTLASS_HOST_DEVICE Coord< 3 > make_Coord_from_shape()
Definition: coord.h:347
+
CUTLASS_HOST_DEVICE bool operator==(Coord< kRank > const &b) const
Determines if two Coord<> objects are equal.
Definition: coord.h:260
+
static int const N
Number of elements in Coord, aliased for compatibility.
Definition: coord.h:58
+
#define __NV_STD_MAX(a, b)
Select maximum(a, b)
Definition: platform.h:163
+
Index idx[kRank]
Indices.
Definition: coord.h:68
+
#define __NV_STD_MIN(a, b)
Select minimum(a, b)
Definition: platform.h:168
+
CUTLASS_HOST_DEVICE Coord & operator-=(Coord const &b)
In-place subtraction.
Definition: coord.h:188
+
CUTLASS_HOST_DEVICE Coord & operator+=(Coord const &b)
In-place addition.
Definition: coord.h:179
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
-
CUTLASS_HOST_DEVICE constexpr const T & min(const T &a, const T &b)
std::min
Definition: platform.h:201
-
Definition: coord.h:41
-
Statically-sized array specifying Coords within a tensor.
Definition: coord.h:48
-
CUTLASS_HOST_DEVICE int & at()
Gets the index of a given Coord element.
Definition: coord.h:185
-
int idx[N]
Indices.
Definition: coord.h:60
-
Definition: coord.h:41
-
CUTLASS_HOST_DEVICE int const & at(int dim) const
Access via index; may limit unrolling potential.
Definition: coord.h:201
+
CUTLASS_HOST_DEVICE bool operator!=(Coord< kRank > const &b) const
Not equal.
Definition: coord.h:270
+
CUTLASS_HOST_DEVICE constexpr const T & min(const T &a, const T &b)
std::min
Definition: platform.h:209
+
CUTLASS_HOST_DEVICE Index & at()
Gets the index of a given Coord element.
Definition: coord.h:240
+
CUTLASS_HOST_DEVICE Coord & operator/=(Coord const &b)
In-place division.
Definition: coord.h:206
+
Definition: coord.h:42
+
CUTLASS_HOST_DEVICE Coord< Slice > slice(int start=0, Index identity=0) const
Definition: coord.h:102
+
Statically-sized array specifying Coords within a tensor.
Definition: coord.h:49
+
CUTLASS_HOST_DEVICE Index const & at() const
Gets the index of a given Coord element.
Definition: coord.h:250
+
CUTLASS_HOST_DEVICE T dot(Coord const &b, T sum) const
Computes the dot product of two Coord instances.
Definition: coord.h:221
+
CUTLASS_HOST_DEVICE Coord(Index value=0)
Default ctor initializes uniformly.
Definition: coord.h:76
+
Definition: coord.h:42
+
CUTLASS_HOST_DEVICE Coord & clamp(Coord< kRank > const &max, Coord< kRank > const &min=Coord< kRank >())
Clamps a coordinate to a range specified by maximum and minimum values.
Definition: coord.h:274
+
CUTLASS_HOST_DEVICE Coord(Index _idx[])
Constructs from an array of integers.
Definition: coord.h:84
+
CUTLASS_HOST_DEVICE T dot(Coord const &b) const
Computes the dot product of two Coord instances.
Definition: coord.h:230
+
CUTLASS_HOST_DEVICE Coord operator+(Coord const &b) const
Element-wise addition.
Definition: coord.h:139
Basic include for CUTLASS macros.
-
CUTLASS_HOST_DEVICE Coord & operator*=(Coord const &b)
In-place multiplication.
Definition: coord.h:142
-
CUTLASS_HOST_DEVICE Coord operator+(Coord const &b) const
Element-wise addition.
Definition: coord.h:84
-
CUTLASS_HOST_DEVICE Coord & operator/=(Coord const &b)
In-place division.
Definition: coord.h:151
+
CUTLASS_HOST_DEVICE Coord(Coord< kRank > const &coord)
Constructs from another Coord of the same rank, copying each element.
Definition: coord.h:92
+
CUTLASS_HOST_DEVICE bool operator!() const
Returns true if Coord is uniformly zero.
Definition: coord.h:128
diff --git a/docs/core__io_8h.html b/docs/core__io_8h.html index d71c397167..2f50d78515 100644 --- a/docs/core__io_8h.html +++ b/docs/core__io_8h.html @@ -73,6 +73,8 @@
core_io.h File Reference
@@ -83,51 +85,56 @@ More...

#include <iosfwd>
#include <typeinfo>
-#include <cutlass/coord.h>
+#include "cutlass/coord.h"
+#include "cutlass/vector.h"

Go to the source code of this file.

+ + + + +

+Classes

struct  cutlass::ScalarIO< T >
 Helper to enable formatted printing of CUTLASS scalar types to an ostream. More...
 
+ + + +

+Namespaces

 cutlass
 
- - - + + + + + + + + + + + + + + + + + + + + + + + + + + +

Functions

template<int Rank>
std::ostream & operator<< (std::ostream &out, cutlass::Coord< Rank > const &coord)
 
template<int Rank>
std::ostream & cutlass::operator<< (std::ostream &out, Coord< Rank > const &coord)
 
template<typename T >
std::ostream & cutlass::operator<< (std::ostream &out, ScalarIO< T > const &scalar)
 Default printing to ostream. More...
 
template<>
std::ostream & cutlass::operator<< (std::ostream &out, ScalarIO< int8_t > const &scalar)
 Printing to ostream of int8_t as integer rather than character. More...
 
template<>
std::ostream & cutlass::operator<< (std::ostream &out, ScalarIO< uint8_t > const &scalar)
 Printing to ostream of uint8_t as integer rather than character. More...
 
template<>
std::ostream & cutlass::operator<< (std::ostream &out, ScalarIO< cutlass::Vector< cutlass::bin1_t, 32 > > const &scalar)
 Printing to ostream of vector of 1b elements. More...
 
template<>
std::ostream & cutlass::operator<< (std::ostream &out, ScalarIO< cutlass::Vector< cutlass::int4_t, 8 > > const &scalar)
 Printing to ostream of vector of 4b signed integer elements. More...
 
template<>
std::ostream & cutlass::operator<< (std::ostream &out, ScalarIO< cutlass::Vector< cutlass::uint4_t, 8 > > const &scalar)
 Printing to ostream of vector of 4b unsigned integer elements. More...
 
-

Function Documentation

- -

◆ operator<<()

- -
-
-
-template<int Rank>
- - - - - - - - - - - - - - - - - - -
std::ostream& operator<< (std::ostream & out,
cutlass::Coord< Rank > const & coord 
)
-
- -
-
diff --git a/docs/core__io_8h_source.html b/docs/core__io_8h_source.html index 7c076c94da..21b790113f 100644 --- a/docs/core__io_8h_source.html +++ b/docs/core__io_8h_source.html @@ -76,11 +76,19 @@
core_io.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 #pragma once
26 
31 #pragma once
32 
33 #include <iosfwd>
34 #include <typeinfo>
35 
36 #include <cutlass/coord.h>
37 
38 template <int Rank>
39 std::ostream& operator<<(std::ostream& out, cutlass::Coord<Rank> const& coord) {
40  for (int i = 0; i < Rank; ++i) {
41  out << (i ? ", " : "") << coord.idx[i];
42  }
43  return out;
44 }
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include <iosfwd>
32 #include <typeinfo>
33 
34 #include "cutlass/coord.h"
35 #include "cutlass/vector.h"
36 
37 namespace cutlass {
38 
40 
41 template <int Rank>
42 std::ostream& operator<<(std::ostream& out, Coord<Rank> const& coord) {
43  for (int i = 0; i < Rank; ++i) {
44  out << (i ? ", " : "") << coord.idx[i];
45  }
46  return out;
47 }
48 
50 
52 template <typename T>
53 struct ScalarIO {
54 
56  T value;
57 
59  ScalarIO() { }
60 
63 };
64 
66 
68 template <typename T>
69 inline std::ostream &operator<<(std::ostream &out, ScalarIO<T> const &scalar) {
70  return out << scalar.value;
71 }
72 
74 template <>
75 inline std::ostream &operator<<(std::ostream &out, ScalarIO<int8_t> const &scalar) {
76  return out << int(scalar.value);
77 }
78 
80 template <>
81 inline std::ostream &operator<<(std::ostream &out, ScalarIO<uint8_t> const &scalar) {
82  return out << unsigned(scalar.value);
83 }
84 
86 template <>
87 inline std::ostream &operator<<(
88  std::ostream &out,
90 
91  for (int i = 0; i < 32; i++) {
92  out << int(scalar.value[i]);
93  out << ((i != 31) ? ", " : "");
94  }
95  return out;
96 }
97 
99 template <>
100 inline std::ostream &operator<<(
101  std::ostream &out,
103 
104  for (int i = 0; i < 8; i++) {
105  out << int(scalar.value[i]);
106  out << ((i != 7) ? ", " : "");
107  }
108  return out;
109 }
110 
112 template <>
113 inline std::ostream &operator<<(
114  std::ostream &out,
116 
117  for (int i = 0; i < 8; i++) {
118  out << unsigned(scalar.value[i]);
119  out << ((i != 7) ? ", " : "");
120  }
121  return out;
122 }
123 
125 
126 } // namespace cutlass
Definition: convert.h:33
+
A Coord is a coordinate of arbitrary rank into a tensor or matrix.
+
ScalarIO(T value)
Constructs from a value.
Definition: core_io.h:62
+
ScalarIO()
Default ctor.
Definition: core_io.h:59
+
std::ostream & operator<<(std::ostream &out, Coord< Rank > const &coord)
Definition: core_io.h:42
+
Helper to enable formatted printing of CUTLASS scalar types to an ostream.
Definition: core_io.h:53
+
Definition: vector.h:62
+
T value
Value to print.
Definition: core_io.h:56
+
Defines a 1D vector of elements held in the registers of each thread.
diff --git a/docs/cutlass_8h.html b/docs/cutlass_8h.html index bbb0463c91..419c9123f3 100644 --- a/docs/cutlass_8h.html +++ b/docs/cutlass_8h.html @@ -73,8 +73,10 @@
cutlass.h File Reference
@@ -85,6 +87,13 @@

Go to the source code of this file.

+ + + + + +

+Classes

struct  DebugType< T >
 
struct  DebugValue< Value >
 
@@ -96,18 +105,26 @@ - + + + - - + + +

Namespaces

 cutlass
 
#define CUTLASS_MINOR   0
 
#define CUTLASS_PATCH   0
#define CUTLASS_PATCH   1
 
#define CUTLASS_VERSION   ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
 
#define CUTLASS_HOST_DEVICE
 
#define CUTLASS_ASSERT(x)   assert(x)
 
#define CUTLASS_PRAGMA_UNROLL
 
#define CUTLASS_PRAGMA_NO_UNROLL
 
#define CUTLASS_ASSERT(x)   assert(x)
 
#define CUTLASS_GEMM_LOOP   CUTLASS_PRAGMA_NO_UNROLL
 
+ + + +

+Functions

template<typename T >
void DebugTypeFunc (T const &t)
 

Macro Definition Documentation

@@ -126,6 +143,20 @@

+ + + +

◆ CUTLASS_GEMM_LOOP

+ +
+
+ + + + +
#define CUTLASS_GEMM_LOOP   CUTLASS_PRAGMA_NO_UNROLL
+
+
@@ -177,7 +208,7 @@

- +
#define CUTLASS_PATCH   0#define CUTLASS_PATCH   1
+ +

Function Documentation

+
+

◆ DebugTypeFunc()

+ +
+
+
+template<typename T >
+ + + + + + + + +
void DebugTypeFunc (T const & t)
+
+
diff --git a/docs/cutlass_8h_source.html b/docs/cutlass_8h_source.html index d2f442295e..9c9fb2b290 100644 --- a/docs/cutlass_8h_source.html +++ b/docs/cutlass_8h_source.html @@ -76,11 +76,14 @@
cutlass.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 
30 #pragma once
31 
33 
34 #define CUTLASS_MAJOR 1
35 #define CUTLASS_MINOR 0
36 #define CUTLASS_PATCH 0
37 #define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
38 
39 #ifdef __NVCC__
40 #define CUTLASS_HOST_DEVICE __forceinline__ __device__ __host__
41 #define CUTLASS_DEVICE __forceinline__ __device__
42 #elif defined(__CUDACC_RTC__)
43 #define CUTLASS_HOST_DEVICE __forceinline__ __device__
44 #define CUTLASS_DEVICE __forceinline__ __device__
45 #else
46 #define CUTLASS_HOST_DEVICE
47 // CUTLASS_DEVICE is an error if not compiling device code
48 #endif
49 
50 // CUTLASS_PRAGMA_UNROLL inserts a CUTLASS_PRAGMA_UNROLL if supported by the compiler
51 #if defined(__CUDA_ARCH__)
52 #if defined(_MSC_VER)
53 #define CUTLASS_PRAGMA_UNROLL __pragma("unroll")
54 #define CUTLASS_PRAGMA_NO_UNROLL __pragma("unroll 1")
55 #else
56 #define CUTLASS_PRAGMA_UNROLL _Pragma("unroll")
57 #define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1")
58 #endif
59 #else
60 #define CUTLASS_PRAGMA_UNROLL
61 #define CUTLASS_PRAGMA_NO_UNROLL
62 #endif
63 
64 #define CUTLASS_ASSERT(x) assert(x)
65 
66 namespace cutlass {
67 
69 static const int kWarpSize = 32;
70 
71 } // namespace cutlass
72 
Definition: convert.h:33
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 
30 #pragma once
31 
33 
34 #define CUTLASS_MAJOR 1
35 #define CUTLASS_MINOR 0
36 #define CUTLASS_PATCH 1
37 #define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)
38 
39 #ifdef __NVCC__
40 #define CUTLASS_HOST_DEVICE __forceinline__ __device__ __host__
41 #define CUTLASS_DEVICE __forceinline__ __device__
42 #elif defined(__CUDACC_RTC__)
43 #define CUTLASS_HOST_DEVICE __forceinline__ __device__
44 #define CUTLASS_DEVICE __forceinline__ __device__
45 #else
46 #define CUTLASS_HOST_DEVICE
47 // CUTLASS_DEVICE is an error if not compiling device code
48 #endif
49 
50 #define CUTLASS_ASSERT(x) assert(x)
51 
52 // CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler.
53 #if defined(__CUDA_ARCH__)
54 #if defined(_MSC_VER)
55 #define CUTLASS_PRAGMA_UNROLL __pragma("unroll")
56 #define CUTLASS_PRAGMA_NO_UNROLL __pragma("unroll 1")
57 #else
58 #define CUTLASS_PRAGMA_UNROLL _Pragma("unroll")
59 #define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1")
60 #endif
61 #else
62 #define CUTLASS_PRAGMA_UNROLL
63 #define CUTLASS_PRAGMA_NO_UNROLL
64 #endif
65 
66 #define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL
67 
68 // A small helper class to dump a type at compile time
69 // Usage:: DumpType<Class>::Class
70 template <typename T>
71 struct DebugType {};
72 
73 template <typename T>
74 void DebugTypeFunc(T const& t) {
75  T::t;
76 }
77 
78 // A small helper class to dump a compile time constant at compile time
79 // Usage: DumpValue<Class::kConstant>::kConstant
80 template <int Value>
81 struct DebugValue {};
82 
83 namespace cutlass {
84 
86 static const int kWarpSize = 32;
87 
88 } // namespace cutlass
89 
Definition: convert.h:33
+
Definition: cutlass.h:81
+
Definition: cutlass.h:71
+
void DebugTypeFunc(T const &t)
Definition: cutlass.h:74
diff --git a/docs/cutlass__math_8h.html b/docs/cutlass__math_8h.html index 953b0d4c70..c4dbc54b0b 100644 --- a/docs/cutlass__math_8h.html +++ b/docs/cutlass__math_8h.html @@ -83,7 +83,7 @@

Math utilities. More...

-
#include <cutlass/util/platform.h>
+

Go to the source code of this file.

@@ -103,6 +103,10 @@ + + + +
 
struct  cutlass::divide_assert< Dividend, Divisor >
 
struct  cutlass::Min< A, B >
 
struct  cutlass::Max< A, B >
 
@@ -120,11 +124,17 @@ + + + + + +

Namespaces

template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::lcm (value_t a, value_t b)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::clz (value_t x)
 
template<typename value_t >
CUTLASS_HOST_DEVICE value_t cutlass::find_log2 (value_t x)
 
diff --git a/docs/cutlass__math_8h_source.html b/docs/cutlass__math_8h_source.html index 2809a84568..8381f641a1 100644 --- a/docs/cutlass__math_8h_source.html +++ b/docs/cutlass__math_8h_source.html @@ -76,27 +76,33 @@
cutlass_math.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 
26 #pragma once
27 
33 #include <cutlass/util/platform.h>
34 
35 namespace cutlass {
36 
37 /******************************************************************************
38  * Static math utilities
39  ******************************************************************************/
40 
44 template <int N>
45 struct is_pow2 : platform::integral_constant<bool, (N & (N - 1)) == 0> {};
46 
50 template <int N, int CurrentVal = N, int Count = 0>
51 struct log2_down {
53  enum { value = log2_down<N, (CurrentVal >> 1), Count + 1>::value };
54 };
55 
56 // Base case
57 template <int N, int Count>
58 struct log2_down<N, 1, Count> {
59  enum { value = Count };
60 };
61 
65 template <int N, int CurrentVal = N, int Count = 0>
66 struct log2_up {
68  enum { value = log2_up<N, (CurrentVal >> 1), Count + 1>::value };
69 };
70 
71 // Base case
72 template <int N, int Count>
73 struct log2_up<N, 1, Count> {
74  enum { value = ((1 << Count) < N) ? Count + 1 : Count };
75 };
76 
80 template <int N>
81 struct sqrt_est {
82  enum { value = 1 << (log2_up<N>::value / 2) };
83 };
84 
89 template <int Dividend, int Divisor>
90 struct divide_assert {
91  enum { value = Dividend / Divisor };
92 
93  static_assert((Dividend % Divisor == 0), "Not an even multiple");
94 };
95 
96 /******************************************************************************
97  * Rounding
98  ******************************************************************************/
99 
103 template <typename dividend_t, typename divisor_t>
104 CUTLASS_HOST_DEVICE dividend_t round_nearest(dividend_t dividend, divisor_t divisor) {
105  return ((dividend + divisor - 1) / divisor) * divisor;
106 }
107 
111 template <typename value_t>
112 CUTLASS_HOST_DEVICE value_t gcd(value_t a, value_t b) {
113  for (;;) {
114  if (a == 0) return b;
115  b %= a;
116  if (b == 0) return a;
117  a %= b;
118  }
119 }
120 
124 template <typename value_t>
125 CUTLASS_HOST_DEVICE value_t lcm(value_t a, value_t b) {
126  value_t temp = gcd(a, b);
127 
128  return temp ? (a / temp * b) : 0;
129 }
130 
131 } // namespace cutlass
Definition: cutlass_math.h:91
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
25 
26 #pragma once
27 
33 #include "cutlass/util/platform.h"
34 
35 namespace cutlass {
36 
37 /******************************************************************************
38  * Static math utilities
39  ******************************************************************************/
40 
44 template <int N>
45 struct is_pow2 : platform::integral_constant<bool, (N & (N - 1)) == 0> {};
46 
50 template <int N, int CurrentVal = N, int Count = 0>
51 struct log2_down {
53  enum { value = log2_down<N, (CurrentVal >> 1), Count + 1>::value };
54 };
55 
56 // Base case
57 template <int N, int Count>
58 struct log2_down<N, 1, Count> {
59  enum { value = Count };
60 };
61 
65 template <int N, int CurrentVal = N, int Count = 0>
66 struct log2_up {
68  enum { value = log2_up<N, (CurrentVal >> 1), Count + 1>::value };
69 };
70 
71 // Base case
72 template <int N, int Count>
73 struct log2_up<N, 1, Count> {
74  enum { value = ((1 << Count) < N) ? Count + 1 : Count };
75 };
76 
80 template <int N>
81 struct sqrt_est {
82  enum { value = 1 << (log2_up<N>::value / 2) };
83 };
84 
89 template <int Dividend, int Divisor>
90 struct divide_assert {
91  enum { value = Dividend / Divisor };
92 
93  static_assert((Dividend % Divisor == 0), "Not an even multiple");
94 };
95 
96 /******************************************************************************
97  * Rounding
98  ******************************************************************************/
99 
103 template <typename dividend_t, typename divisor_t>
104 CUTLASS_HOST_DEVICE dividend_t round_nearest(dividend_t dividend, divisor_t divisor) {
105  return ((dividend + divisor - 1) / divisor) * divisor;
106 }
107 
111 template <typename value_t>
112 CUTLASS_HOST_DEVICE value_t gcd(value_t a, value_t b) {
113  for (;;) {
114  if (a == 0) return b;
115  b %= a;
116  if (b == 0) return a;
117  a %= b;
118  }
119 }
120 
124 template <typename value_t>
125 CUTLASS_HOST_DEVICE value_t lcm(value_t a, value_t b) {
126  value_t temp = gcd(a, b);
127 
128  return temp ? (a / temp * b) : 0;
129 }
130 
136 template <typename value_t>
137 CUTLASS_HOST_DEVICE value_t clz(value_t x) {
138  for (int i = 31; i >= 0; --i) {
139  if ((1 << i) & x) return 31 - i;
140  }
141  return 32;
142 }
143 
144 template <typename value_t>
145 CUTLASS_HOST_DEVICE value_t find_log2(value_t x) {
146  int a = 31 - clz(x);
147  a += (x & (x - 1)) != 0; // Round up, add 1 if not a power of 2.
148  return a;
149 }
150 
151 /******************************************************************************
152  * Min/Max
153  ******************************************************************************/
154 
155 template <int A, int B>
156 struct Min {
157  static int const kValue = (A < B) ? A : B;
158 };
159 
160 template <int A, int B>
161 struct Max {
162  static int const kValue = (A > B) ? A : B;
163 };
164 
165 } // namespace cutlass
Definition: cutlass_math.h:91
Definition: convert.h:33
+
static int const kValue
Definition: cutlass_math.h:157
+
CUTLASS_HOST_DEVICE value_t find_log2(value_t x)
Definition: cutlass_math.h:145
Definition: cutlass_math.h:51
C++ features that may be otherwise unimplemented for CUDA device functions.
+
Definition: cutlass_math.h:156
Definition: cutlass_math.h:53
CUTLASS_HOST_DEVICE value_t lcm(value_t a, value_t b)
Definition: cutlass_math.h:125
CUTLASS_HOST_DEVICE dividend_t round_nearest(dividend_t dividend, divisor_t divisor)
Definition: cutlass_math.h:104
Definition: cutlass_math.h:68
-
std::integral_constant
Definition: platform.h:274
+
std::integral_constant
Definition: platform.h:282
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
-
#define static_assert(__e, __m)
Definition: platform.h:145
+
#define static_assert(__e, __m)
Definition: platform.h:153
+
Definition: cutlass_math.h:161
Definition: cutlass_math.h:82
CUTLASS_HOST_DEVICE value_t gcd(value_t a, value_t b)
Definition: cutlass_math.h:112
Definition: cutlass_math.h:90
Definition: cutlass_math.h:66
+
CUTLASS_HOST_DEVICE value_t clz(value_t x)
Definition: cutlass_math.h:137
Definition: cutlass_math.h:45
+
static int const kValue
Definition: cutlass_math.h:162
Definition: cutlass_math.h:81
diff --git a/docs/debug_8h.html b/docs/debug_8h.html index 1f88396ab8..81ed9f3ca6 100644 --- a/docs/debug_8h.html +++ b/docs/debug_8h.html @@ -231,7 +231,7 @@

diff --git a/docs/debug_8h_source.html b/docs/debug_8h_source.html index 881b4e3f05..c404b41106 100644 --- a/docs/debug_8h_source.html +++ b/docs/debug_8h_source.html @@ -81,7 +81,7 @@

diff --git a/docs/dgemm__traits_8h.html b/docs/dgemm__traits_8h.html index eebc2f364c..ac6d33b0cb 100644 --- a/docs/dgemm__traits_8h.html +++ b/docs/dgemm__traits_8h.html @@ -82,21 +82,21 @@

Defines structural traits of double-precision GEMM. More...

-
#include <cutlass/gemm/gemm.h>
-#include <cutlass/gemm/gemm_epilogue.h>
-#include <cutlass/gemm/gemm_epilogue_traits.h>
-#include <cutlass/gemm/gemm_global_tile.h>
-#include <cutlass/gemm/gemm_shared_tile.h>
-#include <cutlass/gemm/gemm_traits.h>
-#include <cutlass/gemm/thread_multiply_add.h>
+

Go to the source code of this file.

- + - +

Classes

struct  cutlass::gemm::DgemmConfig< OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_ >
struct  cutlass::gemm::DgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ >
 
struct  cutlass::gemm::DgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, GemmConfig_, GemmEpilogueTraits_ >
struct  cutlass::gemm::DgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, GemmConfig_, GemmEpilogueTraits_ >
 
- - - + + + @@ -128,12 +128,24 @@ + + + + + + + + + + + + @@ -143,11 +155,20 @@ + + + + + + + + +

@@ -109,7 +109,7 @@ diff --git a/docs/dgemm__traits_8h_source.html b/docs/dgemm__traits_8h_source.html index 9cf2c8738a..d7cdbe5295 100644 --- a/docs/dgemm__traits_8h_source.html +++ b/docs/dgemm__traits_8h_source.html @@ -76,26 +76,26 @@
dgemm_traits.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #include <cutlass/gemm/gemm.h>
37 
38 namespace cutlass {
39 namespace gemm {
40 
42 
43 template <
45  typename OutputTile_,
47  typename AccumulatorsPerThread_,
49  int kScalarsPerLdgA_ = 1,
51  int kScalarsPerLdgB_ = 1>
53  : public GemmConfig<
55  double,
57  double,
59  double,
61  double,
63  OutputTile_,
65  ThreadMultiplyAdd<AccumulatorsPerThread_, Shape<1, 4, 8>, double, double, double>,
67  kScalarsPerLdgA_,
69  kScalarsPerLdgA_,
71  2,
73  kScalarsPerLdgB_,
75  kScalarsPerLdgB_,
77  2,
79  1,
81  2,
83  1,
85  2> {};
86 
88 
89 template <
91  MatrixLayout::Kind kLayoutA_,
93  MatrixLayout::Kind kLayoutB_,
95  typename OutputTile_ = Shape<8, 64, 128>,
97  typename EpilogueFunctor_ = LinearScaling<double>,
99  typename AccumulatorsPerThread_ = Shape<8, 8, 8>,
101  int kScalarsPerLdgA_ = 1,
103  int kScalarsPerLdgB_ = 1,
105  typename Index_ = int,
107  typename GemmConfig_ =
110  typename GemmEpilogueTraits_ =
113  // The layout for A.
114  kLayoutA_,
115  // The layout for B.
116  kLayoutB_,
117  // The config.
118  GemmConfig_,
119  // The epilogue.
120  GemmEpilogue<GemmEpilogueTraits_>,
121  // The index.
122  Index_> {};
123 
125 
126 } // namespace gemm
127 } // namespace cutlass
Definition: convert.h:33
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #include "cutlass/gemm/gemm.h"
37 
38 namespace cutlass {
39 namespace gemm {
40 
42 
43 template <
45  typename OutputTile_,
47  typename ThreadGemmShape_,
49  int kScalarsPerLdgA_ = 1,
51  int kScalarsPerLdgB_ = 1>
53  : public GemmConfig<
55  double,
57  double,
59  double,
61  double,
63  OutputTile_,
65  ThreadMultiplyAdd<ThreadGemmShape_, Shape<1, 4, 8>, double, double, double>,
67  kScalarsPerLdgA_,
69  kScalarsPerLdgA_,
71  2,
73  kScalarsPerLdgB_,
75  kScalarsPerLdgB_,
77  2,
79  1,
81  2,
83  1,
85  2,
87  false,
89  false,
91  false
92  >{};
93 
95 
96 template <
98  MatrixLayout::Kind kLayoutA_,
100  MatrixLayout::Kind kLayoutB_,
102  typename OutputTile_ = Shape<8, 64, 128>,
104  typename EpilogueFunctor_ = LinearScaling<double>,
106  typename ThreadGemmShape_ = Shape<8, 8, 8>,
108  int kScalarsPerLdgA_ = 1,
110  int kScalarsPerLdgB_ = 1,
112  typename Index_ = int,
114  typename GemmConfig_ =
117  typename GemmEpilogueTraits_ =
120  // The layout for A.
121  kLayoutA_,
122  // The layout for B.
123  kLayoutB_,
124  // The config.
125  GemmConfig_,
126  // The epilogue.
127  GemmEpilogue<GemmEpilogueTraits_>,
128  // The index.
129  Index_> {};
130 
132 
133 } // namespace gemm
134 } // namespace cutlass
Definition: convert.h:33
Defines iterators for efficiently loading and storing to global memory.
Defines structural properties of complete GEMM computation.
Template implementing matrix multiply-add operations on fragments.
Implements the epilogue phase of the GEMM kernel that efficiently updates global memory with the comp...
Defines iterators for efficiently loading and storing tiles to and from shared memory.
-
Definition: gemm_traits.h:79
-
Definition: dgemm_traits.h:112
+
Definition: gemm_config.h:76
+
Definition: dgemm_traits.h:119
Definition: dgemm_traits.h:52
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
-
Definition: gemm_epilogue_traits.h:300
-
Kind
Definition: matrix_traits.h:36
-
Functor to compute linear combination of fragments.
Definition: linear_scaling.h:40
+
Definition: gemm_epilogue_traits.h:323
+
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159
+
Functor to compute linear combination of fragments.
Definition: linear_scaling.h:51
Implements a software-pipelined efficient GEMM.
Defines structural properties of the GEMM epilogue.
-
Definition: gemm_traits.h:723
+
Definition: gemm_traits.h:650
diff --git a/docs/dir_1417ee5ebebc309c36b7962f26a92c39.html b/docs/dir_1417ee5ebebc309c36b7962f26a92c39.html index d7393ef13f..6555e36cf4 100644 --- a/docs/dir_1417ee5ebebc309c36b7962f26a92c39.html +++ b/docs/dir_1417ee5ebebc309c36b7962f26a92c39.html @@ -101,15 +101,15 @@

file  fragment.h [code]
 Defines Fragment, a statically-sized array for storing parts of matrices within a thread's registers.
 
file  fragment_load_store.h [code]
 Defines accessors for loading and storing fragments to memory efficiently.
 
file  fragment_multiply_add.h [code]
 Defines multiply-add operations on fragments within a thread.
 
file  iterator_access.h [code]
 Free functions for loading and storing to implementations of tile iterator concepts.
 
file  kernel_launch.h [code]
 Defines structures and helpers to launch CUDA kernels within CUTLASS.
 
file  load_store.h [code]
 Defines abstractions for efficiently loading and storing vectors to memory.
 
file  tensor_ref.h [code]
 Defines a structure containing strides, bounds, and a pointer to tensor data.
 
file  tensor_ref_collection.h [code]
 Introduces TensorRefCollection concept and defines TensorRefBatch and TensorRefArray.
 
file  tensor_view.h [code]
 Defines a structure containing strides and a pointer to tensor data.
 
file  tile_allocation.h [code]
 Defines a fragment based on a Shape<> template.
 
file  tile_coord.h [code]
 Defines a coordinate used for the CUTLASS 4-D tile structure.
 
file  tile_iterator.h [code]
 Defines the Tile Traits concept and iterators for loading and storing to tiles efficiently.
 
file  tile_stream.h [code]
 Implements the tile stream concept, composing an iterator with a transformation. Offers split-phase semantics, separating the initiation of an asynchronous memory operation with a fence forcing it to complete.
 
file  tile_traits_standard.h [code]
 Defines tile traits for several tile partitioning arrangements of threads expected to achieve efficient streaming performance.
 
file  wmma_matrix.h [code]
 Abstractions for loading and storing matrices using the CUDA WMMA API.
 
file  zip_fragment.h [code]
 Models a pair of fragments.
 
file  zip_tensor_ref.h [code]
 Defines a structure containing a pair of TensorRef-like objects.
 
file  zip_tile_iterator.h [code]
 Constructs an iterator that owns two tile iterator instances.
 
diff --git a/docs/dir_18d6a367a3982a494d65599933fc67a3.html b/docs/dir_18d6a367a3982a494d65599933fc67a3.html index 161267475b..b606ad3e06 100644 --- a/docs/dir_18d6a367a3982a494d65599933fc67a3.html +++ b/docs/dir_18d6a367a3982a494d65599933fc67a3.html @@ -85,9 +85,24 @@

file  dgemm_traits.h [code]
 Defines structural traits of double-precision GEMM.
 
file  fp16_sgemm_multiply_add.h [code]
 Template implementing matrix multiply-add operations on fragments.
 
file  fp16_sgemm_traits.h [code]
 Defines structural properties of single-precision GEMM where any number of the input/output could be fp16 or fp32. The accumulator type stays in fp32.
 
file  gemm.h [code]
 Implements a software-pipelined efficient GEMM.
 
file  gemm_config.h [code]
 Defines properties of GEMM computation that impose some constraints on caller.
 
file  gemm_coord.h [code]
 GemmCoord is a structure derived from Coord<4> that specifies a location within the coordinate system of a GEMM problem.
 
file  gemm_desc.h [code]
 Implements a software-pipelined efficient GEMM.
 
file  gemm_epilogue.h [code]
 Implements the epilogue phase of the GEMM kernel that efficiently updates global memory with the computed matrix product.
 
file  gemm_shared_tile.h [code]
 Defines iterators for efficiently loading and storing tiles to and from shared memory.
 
file  gemm_stream_pair.h [code]
 Defines a pair of GEMM tile streams.
 
file  gemm_traits.h [code]
 Defines structural properties of complete GEMM computation.
 
file  hgemm_traits.h [code]
 Defines structural properties of half-precision GEMM computation.
 
file  identity_block_swizzle.h [code]
 Defines functors for mapping blockIdx to partitions of the GEMM computation.
 
file  igemm_epilogue.h [code]
 Defines the epilogue phase of the GEMM computation for IGEMM, supporting integer and floating-point output matrix formats.
 
file  linear_scaling.h [code]
 Implements the BLAS linear scaling function alpha*AB + beta*C.
 
file  linear_scaling_device_ptr.h [code]
 Implements the BLAS linear scaling function alpha*AB + beta*C.
 
file  scalar_or_pointer.h [code]
 Implements the BLAS linear scaling function alpha*AB + beta*C.
 
file  sgemm_traits.h [code]
 Defines structural properties of single-precision GEMM.
 
file  thread_multiply_add.h [code]
 Template implementing matrix multiply-add operations on fragments.
 
file  threadblock_swizzle.h [code]
 Defines functors for mapping blockIdx to partitions of the GEMM computation.
 
file  wmma_gemm_epilogue_traits.h [code]
 Defines structural properties of WMMA GEMM's epilogue phase.
 
+ + + + @@ -92,7 +96,7 @@ diff --git a/docs/files.html b/docs/files.html index 2c06de5a89..1019520907 100644 --- a/docs/files.html +++ b/docs/files.html @@ -75,62 +75,79 @@
Here is a list of all files with brief descriptions:

Files

file  complex.h [code]
 
file  cutlass_math.h [code]
 Math utilities.
 
file  debug.h [code]
 Debugging and logging functionality.
 
file  numeric_types.h [code]
 
file  platform.h [code]
 C++ features that may be otherwise unimplemented for CUDA device functions.
 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 clear_accumulators.hDefines abstractions for efficiently clearing accumulator tiles
 convert.hDefines conversion operations among Fragments of different base type
 coord.hA Coord is a coordinate of arbitrary rank into a tensor or matrix
 core_io.hHelpers for printing cutlass/core objects
 cutlass.hBasic include for CUTLASS macros
 cutlass_math.hMath utilities
 debug.hDebugging and logging functionality
 dgemm_traits.hDefines structural traits of double-precision GEMM
 fragment.hDefines Fragment, a statically-sized array for storing parts of matrices within a thread's registers
 fragment_load_store.hDefines accessors for loading and storing fragments to memory efficiently
 fragment_multiply_add.hDefines multiply-add operations on fragments within a thread
 gemm.hImplements a software-pipelined efficient GEMM
 gemm_epilogue.hImplements the epilogue phase of the GEMM kernel that efficiently updates global memory with the computed matrix product
 gemm_epilogue_traits.hDefines structural properties of the GEMM epilogue
 gemm_global_stream.hImplements efficient loading of the thread block-level tile from global memory and storing to shared memory
 gemm_global_tile.hDefines iterators for efficiently loading and storing to global memory
 gemm_operand.hDefines constant expressions for mapping GEMM problem size and strides onto pitch-linear memory
 gemm_shared_stream.hDefines abstractions for managing loading and storing fragments to shared memory in the efficient GEMM pipeline
 gemm_shared_tile.hDefines iterators for efficiently loading and storing tiles to and from shared memory
 gemm_traits.hDefines structural properties of complete GEMM computation
 hgemm_global_tile.hTile traits used to construct global tile iterator for HGEMM. This is intended to partition the thread block-level tile into 2D subtiles loaded by the threads and facilitate memory accesses larger than 16 bits
 hgemm_multiply_add.hSpecialization implementing multiply-add operation on half-precision floating point fragments
 hgemm_swizzle.hTransposes a tile of 16b elements. Used by HGEMM to construct a K-strided layout in shared memory for multiplicands
 hgemm_traits.hDefies structural properties of half-precision GEMM computation
 identity_block_swizzle.hDefies functors for mapping blockIdx to partitions of the GEMM computation
 igemm_epilogue.hDefines the epilogue phase of the GEMM computation for IGEMM, supporting integer and floating-point output matrix formats
 igemm_global_tile.hImplements tile iterators to partition the thread block tile into 2D subtiles and efficiently load each. Applies permute transformation to construct 'interleaved K-strided' data layout in which 4-element dot products from the same K index are arranged in consecutive locations within shared memory
 igemm_multiply_add.hImplements matrix multiply accumulate operation of 8-bit integer data using DP4A instruction
 igemm_swizzle.hTransposes a fragment of data containing packed 8-bit integer elements
 igemm_traits.hDefies structural properties of mixed-precision integer GEMM. Multiplicands are assumed to be packed 8bit integers, accumulators are assumed to be 32b signed integers, and output formats vary
 iterator_access.hFree functions for loading and storing to implementations of tile iteartor concepts
 linear_scaling.hImplements the BLAS linear scaling function alpha*AB + beta*C
 load_store.hDefines abstractions for efficiently loading and storing vectors to memory
 matrix_traits.hDefines properties of matrices used to denote layout and operands to GEMM kernels
 platform.hC++ features that may be otherwise unimplemented for CUDA device functions
 predicate_vector.hDefines container classes and iterators for managing a statically sized vector of boolean predicates
 reshape_tile.hDefines a type for restructuring a tile
 sgemm_traits.hDefies structural properties of single-precision GEMM
 shape.hDefines Shape implementing the Layout concept for representing a 4D hypercube of objects
 tensor_ref.hDefines a structure containing strides, bounds, and a pointer to tensor data
 tensor_view.hDefines a structure containing strides and a pointer to tensor data
 thread_multiply_add.hTemplate implementing matrix multiply-add operations on fragments
 tile_iterator.hDefines the Tile Traits concept and iterators for loading and storing to tiles efficiently
 tile_traits_standard.hDefines tile traits for several tile partitioning arrangements of threads expected to achieve efficient streaming performance
 vector.hDefines a 1D vector of elements held in the registers of each thread
 wmma_gemm_epilogue_traits.hDefines structural properties of WMMA GEMM's epilogue phase
 wmma_gemm_global_tile.hDefines tile iterator traits for loading thread block-level tile from global memory
 wmma_gemm_multiply_add.hImplements warp-level matrix multiply-accumulate operation using CUDA WMMA API
 wmma_gemm_shared_tile.hDefines iterator traits for efficiently loading and storing fragment to and from shared memory, specialized for WMMA GEMM
 wmma_gemm_traits.hDefies structural properties of GEMM targeting WMMA API in CUDA
 wmma_matrix.hAbstractions for loading and storing matrices using the CUDA WMMA API
 complex.h
 convert.hDefines conversion operations among Fragments of different base type
 coord.hA Coord is a coordinate of arbitrary rank into a tensor or matrix
 core_io.hHelpers for printing cutlass/core objects
 cutlass.hBasic include for CUTLASS macros
 cutlass_math.hMath utilities
 debug.hDebugging and logging functionality
 dgemm_traits.hDefines structural traits of double-precision GEMM
 fp16_sgemm_multiply_add.hTemplate implementing matrix multiply-add operations on fragments
 fp16_sgemm_traits.hDefines structural properties of single-precision GEMM where any number of the input/output could be fp16 or fp32. The accumulator type stays in fp32
 fragment.hDefines Fragment, a statically-sized array for storing parts of matrices within a thread's registers
 fragment_multiply_add.hDefines multiply-add operations on fragments within a thread
 gemm.hImplements a software-pipelined efficient GEMM
 gemm_config.hDefines properties of GEMM computation that impose some constraints on caller
 gemm_coord.hGemmCoord is a structure derived from Coord<4> that specifies a location within the coordinate system of a GEMM problem
 gemm_desc.hImplements a software-pipelined efficient GEMM
 gemm_epilogue.hImplements the epilogue phase of the GEMM kernel that efficiently updates global memory with the computed matrix product
 gemm_epilogue_traits.hDefines structural properties of the GEMM epilogue
 gemm_global_stream.hImplements efficient loading of the thread block-level tile from global memory and storing to shared memory
 gemm_global_tile.hDefines iterators for efficiently loading and storing to global memory
 gemm_operand.hDefines constant expressions for mapping GEMM problem size and strides onto pitch-linear memory
 gemm_shared_stream.hDefines abstractions for managing loading and storing fragments to shared memory in the efficient GEMM pipeline
 gemm_shared_tile.hDefines iterators for efficiently loading and storing tiles to and from shared memory
 gemm_stream_pair.hDefines a pair of GEMM tile streams
 gemm_traits.hDefines structural properties of complete GEMM computation
 hgemm_global_tile.hTile traits used to construct global tile iterator for HGEMM. This is intended to partition the thread block-level tile into 2D subtiles loaded by the threads and facilitate memory accesses larger than 16 bits
 hgemm_multiply_add.hSpecialization implementing multiply-add operation on half-precision floating point fragments
 hgemm_swizzle.hTransposes a tile of 16b elements. Used by HGEMM to construct a K-strided layout in shared memory for multiplicands
 hgemm_traits.hDefines structural properties of half-precision GEMM computation
 igemm_epilogue.hDefines the epilogue phase of the GEMM computation for IGEMM, supporting integer and floating-point output matrix formats
 igemm_global_tile.hImplements tile iterators to partition the thread block tile into 2D subtiles and efficiently load each. Applies permute transformation to construct 'interleaved K-strided' data layout in which 4-element dot products from the same K index are arranged in consecutive locations within shared memory
 igemm_multiply_add.hImplements matrix multiply accumulate operation of 8-bit integer data using DP4A instruction
 igemm_swizzle.hTransposes a fragment of data containing packed 8-bit integer elements
 igemm_traits.hDefines structural properties of mixed-precision integer GEMM. Multiplicands are assumed to be packed 8bit integers, accumulators are assumed to be 32b signed integers, and output formats vary
 iterator_access.hFree functions for loading and storing to implementations of tile iterator concepts
 kernel_launch.hDefines structures and helpers to launch CUDA kernels within CUTLASS
 linear_scaling.hImplements the BLAS linear scaling function alpha*AB + beta*C
 linear_scaling_device_ptr.hImplements the BLAS linear scaling function alpha*AB + beta*C
 load_store.hDefines abstractions for efficiently loading and storing vectors to memory
 matrix_traits.hDefines properties of matrices used to denote layout and operands to GEMM kernels
 numeric_types.h
 platform.hC++ features that may be otherwise unimplemented for CUDA device functions
 predicate_vector.hDefines container classes and iterators for managing a statically sized vector of boolean predicates
 reshape_tile.hDefines a type for restructuring a tile
 scalar_or_pointer.hImplements the BLAS linear scaling function alpha*AB + beta*C
 sgemm_traits.hDefines structural properties of single-precision GEMM
 shape.hDefines Shape implementing the Layout concept for representing a 4D hypercube of objects
 tensor_ref.hDefines a structure containing strides, bounds, and a pointer to tensor data
 tensor_ref_collection.hIntroduces TensorRefCollection concept and defines TensorRefBatch and TensorRefArray
 tensor_view.hDefines a structure containing strides and a pointer to tensor data
 thread_multiply_add.hTemplate implementing matrix multiply-add operations on fragments
 threadblock_swizzle.hDefines functors for mapping blockIdx to partitions of the GEMM computation
 tile_allocation.hDefines a fragment based on a Shape<> template
 tile_coord.hDefines a coordinate used for the CUTLASS 4-D tile structure
 tile_iterator.hDefines the Tile Traits concept and iterators for loading and storing to tiles efficiently
 tile_stream.hImplements the tile stream concept, composing an iterator with a transformation. Offers split-phase semantics, separating the initiation of an asynchronous memory operation with a fence forcing it to complete
 tile_traits_standard.hDefines tile traits for several tile partitioning arrangements of threads expected to achieve efficient streaming performance
 vector.hDefines a 1D vector of elements held in the registers of each thread
 wmma_gemm_epilogue_traits.hDefines structural properties of WMMA GEMM's epilogue phase
 wmma_gemm_global_tile.hDefines tile iterator traits for loading thread block-level tile from global memory
 wmma_gemm_multiply_add.hImplements warp-level matrix multiply-accumulate operation using CUDA WMMA API
 wmma_gemm_shared_tile.hDefines iterator traits for efficiently loading and storing fragment to and from shared memory, specialized for WMMA GEMM
 wmma_gemm_traits.hDefines structural properties of GEMM targeting WMMA API in CUDA
 wmma_matrix.hAbstractions for loading and storing matrices using the CUDA WMMA API
 zip_fragment.hModels a pair of fragments
 zip_tensor_ref.hDefines a structure containing a pair of TensorRef-like objects
 zip_tile_iterator.hConstructs an iterator that owns two tile iterator instances
diff --git a/docs/fp16__sgemm__multiply__add_8h.html b/docs/fp16__sgemm__multiply__add_8h.html new file mode 100644 index 0000000000..deff050ef5 --- /dev/null +++ b/docs/fp16__sgemm__multiply__add_8h.html @@ -0,0 +1,111 @@ + + + + + + + +Cutlass: fp16_sgemm_multiply_add.h File Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
fp16_sgemm_multiply_add.h File Reference
+
+
+ +

Template implementing matrix multiply-add operations on fragments. +More...

+ +

Go to the source code of this file.

+ + + + + +

+Classes

struct  cutlass::gemm::ThreadMultiplyAdd< ThreadGemmShape_, ThreadsPerWarp_, half, half, float >
 Template performing matrix multiply-add operation within a thread. More...
 
+ + + + + +

+Namespaces

 cutlass
 
 cutlass::gemm
 
+
+ + + + diff --git a/docs/fp16__sgemm__multiply__add_8h_source.html b/docs/fp16__sgemm__multiply__add_8h_source.html new file mode 100644 index 0000000000..efac04637f --- /dev/null +++ b/docs/fp16__sgemm__multiply__add_8h_source.html @@ -0,0 +1,107 @@ + + + + + + + +Cutlass: fp16_sgemm_multiply_add.h Source File + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
fp16_sgemm_multiply_add.h
+
+
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #include "cutlass/fragment.h"
32 namespace cutlass {
33 namespace gemm {
34 
36 
38 template <typename ThreadGemmShape_,
39  typename ThreadsPerWarp_>
40 struct ThreadMultiplyAdd<ThreadGemmShape_, ThreadsPerWarp_, half, half, float> {
44  typedef ThreadGemmShape_ ThreadGemmShape;
48  typedef ThreadsPerWarp_ ThreadsPerWarp;
52  typedef half ScalarA;
56  typedef half ScalarB;
60  typedef float ScalarC;
63 
65  CUTLASS_DEVICE ThreadMultiplyAdd() {}
66 
68  CUTLASS_DEVICE void multiply_add(FragmentA const& a,
69  FragmentB const& b,
70  Accumulators const& c,
71  Accumulators& d) {
72  for (int j = 0; j < AccumulatorsPerThread::kH; ++j) {
73  for (int i = 0; i < AccumulatorsPerThread::kW; ++i) {
74  d[j * AccumulatorsPerThread::kW + i] = static_cast<ScalarC>(a[i]) * static_cast<ScalarC>(b[j]) + c[j * AccumulatorsPerThread::kW + i];
75  }
76  }
77  }
78 };
79 
81 
82 } // namespace gemm
83 } // namespace cutlass
Definition: convert.h:33
+
CUTLASS_DEVICE ThreadMultiplyAdd()
Ctor.
Definition: fp16_sgemm_multiply_add.h:65
+
Fragment< ScalarB, AccumulatorsPerThread::kH > FragmentB
The fragment for B.
Definition: fp16_sgemm_multiply_add.h:58
+
Shape< A_::kD *B_::kD, A_::kH *B_::kH, A_::kW *B_::kW, A_::kC *B_::kC > Shape
Definition: shape.h:119
+
A template defining Fragment Concept.
Definition: fragment.h:99
+
ShapeMul< ThreadGemmShape, ThreadsPerWarp >::Shape AccumulatorsPerWarp
The number of accumulators per warp.
Definition: fp16_sgemm_multiply_add.h:50
+
Template implementing matrix multiply-add operations on fragments.
+
ThreadGemmShape_ ThreadGemmShape
The shape of a thread-level matrix multiply accumulate.
Definition: fp16_sgemm_multiply_add.h:44
+
CUTLASS_DEVICE void multiply_add(FragmentA const &a, FragmentB const &b, Accumulators const &c, Accumulators &d)
Multiply : d = a*b + c.
Definition: fp16_sgemm_multiply_add.h:68
+
half ScalarA
The type for A. specialized to half.
Definition: fp16_sgemm_multiply_add.h:52
+
half ScalarB
The type for B. specialized to half.
Definition: fp16_sgemm_multiply_add.h:56
+
ThreadsPerWarp_ ThreadsPerWarp
The number of threads per warp.
Definition: fp16_sgemm_multiply_add.h:48
+
Fragment< ScalarA, AccumulatorsPerThread::kW > FragmentA
The fragment for A.
Definition: fp16_sgemm_multiply_add.h:54
+
float ScalarC
The type for C and D. specialized to float.
Definition: fp16_sgemm_multiply_add.h:60
+
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
+
Fragment< ScalarC, AccumulatorsPerThread::kH *AccumulatorsPerThread::kW, 16 > Accumulators
The accumulators.
Definition: fp16_sgemm_multiply_add.h:62
+
ThreadGemmShape AccumulatorsPerThread
Aliased to "AccumulatorsPerThread" for compatibility. Expect to be renamed in CUTLASS v2...
Definition: fp16_sgemm_multiply_add.h:46
+
Template performing matrix multiply-add operation within a thread.
Definition: thread_multiply_add.h:44
+
Defines Fragment, a statically-sized array for storing parts of matrices within a thread&#39;s registers...
+
Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: fp16_sgemm_multiply_add.h:42
+
+ + + + diff --git a/docs/fp16__sgemm__traits_8h.html b/docs/fp16__sgemm__traits_8h.html new file mode 100644 index 0000000000..0691fbbfcf --- /dev/null +++ b/docs/fp16__sgemm__traits_8h.html @@ -0,0 +1,117 @@ + + + + + + + +Cutlass: fp16_sgemm_traits.h File Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
fp16_sgemm_traits.h File Reference
+
+ + + + + diff --git a/docs/fp16__sgemm__traits_8h_source.html b/docs/fp16__sgemm__traits_8h_source.html new file mode 100644 index 0000000000..b5f94457fe --- /dev/null +++ b/docs/fp16__sgemm__traits_8h_source.html @@ -0,0 +1,104 @@ + + + + + + + +Cutlass: fp16_sgemm_traits.h Source File + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
fp16_sgemm_traits.h
+
+
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include "cutlass/gemm/gemm.h"
38 
39 namespace cutlass {
40 namespace gemm {
41 
43 
44 template <
46  typename OutputTile_,
48  typename ThreadGemmShape_,
50  typename ScalarA_,
52  typename ScalarB_,
54  typename ScalarC_,
56  typename ScalarD_,
58  int kScalarsPerLdgA_ = 1,
60  int kScalarsPerLdgB_ = 1>
61 struct Fp16SgemmConfig : public GemmConfig<
63  ScalarA_,
65  ScalarB_,
67  ScalarC_,
69  ScalarD_,
71  OutputTile_,
73  ThreadMultiplyAdd<ThreadGemmShape_, Shape<1, 4, 8>, ScalarA_, ScalarB_, float /*for sgemm accum is float*/>,
75  kScalarsPerLdgA_,
77  kScalarsPerLdgA_,
79  4,
81  kScalarsPerLdgB_,
83  kScalarsPerLdgB_,
85  4,
87  1,
89  4,
91  1,
93  2> {};
94 
96 
97 template <
99  MatrixLayout::Kind kLayoutA_,
101  MatrixLayout::Kind kLayoutB_,
103  typename OutputTile_ = Shape<8, 128, 128>,
105  typename ScalarA_ = half,
107  typename ScalarB_ = half,
109  typename ScalarC_ = half,
111  typename ScalarD_ = half,
113  typename Scalar_ = half,
115  typename EpilogueFunctor_ = LinearScaling<Scalar_, FragmentMultiplyAdd<Scalar_, float/*accumulator type*/> >,
117  typename ThreadGemmShape_ = Shape<8, 8, 8>,
119  int kScalarsPerLdgA_ = 1,
121  int kScalarsPerLdgB_ = 1,
123  typename Index_ = int,
125  typename GemmConfig_ =
126  Fp16SgemmConfig<OutputTile_,
127  ThreadGemmShape_,
128  ScalarA_,
129  ScalarB_,
130  ScalarC_,
131  ScalarD_,
132  kScalarsPerLdgA_,
133  kScalarsPerLdgB_>,
135  typename GemmEpilogueTraits_ =
138  // The layout for A.
139  kLayoutA_,
140  // The layout for B.
141  kLayoutB_,
142  // The config.
143  GemmConfig_,
144  // The epilogue.
145  GemmEpilogue<GemmEpilogueTraits_>,
146  // The index.
147  Index_> {};
148 
150 
151 } // namespace gemm
152 } // namespace cutlass
Definition: convert.h:33
+
Defines iterators for efficiently loading and storing to global memory.
+
Defines structural properties of complete GEMM computation.
+
Implements the epilogue phase of the GEMM kernel that efficiently updates global memory with the comp...
+
Defines iterators for efficiently loading and storing tiles to and from shared memory.
+
Definition: gemm_config.h:76
+
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
+
Definition: gemm_epilogue_traits.h:323
+
Definition: fp16_sgemm_traits.h:61
+
Kind
Enumeration defining fundamental contiguous layouts.
Definition: matrix_traits.h:159
+
Template implementing matrix multiply-add operations on fragments.
+
Functor to compute linear combination of fragments.
Definition: linear_scaling.h:51
+
Implements a software-pipelined efficient GEMM.
+
Defines structural properties of the GEMM epilogue.
+
Definition: fp16_sgemm_traits.h:137
+
Definition: gemm_traits.h:650
+
Definition: fragment_multiply_add.h:41
+
+ + + + diff --git a/docs/fragment_8h.html b/docs/fragment_8h.html index d97ac7b5a8..687dfdc86b 100644 --- a/docs/fragment_8h.html +++ b/docs/fragment_8h.html @@ -83,15 +83,15 @@

Defines Fragment, a statically-sized array for storing parts of matrices within a thread's registers. More...

#include <assert.h>
-#include <cutlass/shape.h>
-#include <cutlass/util/cutlass_math.h>
-#include <cutlass/vector.h>
+#include "cutlass/shape.h"
+#include "cutlass/util/cutlass_math.h"
+#include "cutlass/vector.h"

Go to the source code of this file.

- + @@ -116,7 +116,7 @@ diff --git a/docs/fragment_8h_source.html b/docs/fragment_8h_source.html index 8006bbbdf4..f7d2365654 100644 --- a/docs/fragment_8h_source.html +++ b/docs/fragment_8h_source.html @@ -76,64 +76,66 @@
fragment.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include <assert.h>
32 #include <cutlass/shape.h>
34 #include <cutlass/vector.h>
35 
36 namespace cutlass {
37 
39 
56 
73 
75 template <int kAlignment_>
76 struct StorageType {
77  typedef uint64_t Type;
78 };
79 template <>
80 struct StorageType<4> {
81  typedef uint32_t Type;
82 };
83 template <>
84 struct StorageType<2> {
85  typedef uint16_t Type;
86 };
87 template <>
88 struct StorageType<1> {
89  typedef uint8_t Type;
90 };
91 
93 
98 template <typename Element_, int kElements_, size_t kAlignment_ = 16>
99 struct Fragment : public AlignedStruct<kAlignment_> {
101  static_assert(kAlignment_ == 16 || kAlignment_ >= sizeof(Element_), "Alignment is too small");
103  static_assert(is_pow2<kAlignment_>::value, "Alignment must be a power of two");
104 
108  typedef Element_ Element;
110  static int const kElements = kElements_;
111 
113  CUTLASS_DEVICE void clear() {
114  // Avoid element-wise access for sub 32b element type
115  if (kAlignment_ >= 8 && (kElements * sizeof(Element)) % 8 == 0) {
116  uint64_t* ptr = reinterpret_cast<uint64_t*>(storage);
117  for (int i = 0; i < (kElements * sizeof(Element)) / 8; ++i) {
118  ptr[i] = uint64_t(0);
119  }
120  } else if (kAlignment_ >= 4 && (kElements * sizeof(Element)) % 4 == 0) {
121  uint32_t* ptr = reinterpret_cast<uint32_t*>(storage);
122  for (int i = 0; i < (kElements * sizeof(Element)) / 4; ++i) {
123  ptr[i] = uint32_t(0);
124  }
125  } else if (kAlignment_ >= 2 && (kElements * sizeof(Element)) % 2 == 0) {
126  uint16_t* ptr = reinterpret_cast<uint16_t*>(storage);
127  for (int i = 0; i < (kElements * sizeof(Element)) / 2; ++i) {
128  ptr[i] = uint16_t(0);
129  }
130  } else {
131  for (int i = 0; i < kElements; ++i) {
132  storage[i] = 0;
133  }
134  }
135  }
136 
138  CUTLASS_DEVICE Element& operator[](int i) {
139  assert(i < kElements_);
140  return reinterpret_cast<Element*>(storage)[i];
141  }
142 
144  CUTLASS_DEVICE Element const& operator[](int i) const {
145  assert(i < kElements_);
146  return reinterpret_cast<Element const*>(storage)[i];
147  }
148 
149  private:
152 
154  static int const kStorageCount =
155  (sizeof(Element_) * kElements_ + sizeof(StorageType) - 1) / sizeof(StorageType);
157  StorageType storage[kStorageCount];
158 
160  static_assert(sizeof(StorageType) <= kAlignment_, "StorageType is too big for given alignment");
161 };
162 
164 
169 template <typename Fragment_, typename Iterations_, typename AccessType_>
174  typedef Fragment_ Fragment;
176  typedef Iterations_ Iterations;
178  typedef AccessType_ AccessType;
179 
181  typedef typename Fragment::Element Element;
183  static int const kElementsPerAccess = (int)(sizeof(AccessType) / sizeof(Element));
188 
190  template <typename OtherFragment_>
191  CUTLASS_DEVICE FragmentIterator(OtherFragment_& fragment, int offset = 0)
192  : pointer(reinterpret_cast<Element*>(&fragment[offset])) {
193  static_assert(OtherFragment_::kElements >= Fragment::kElements, "");
194  }
195 
197  CUTLASS_DEVICE AccessType const& at(int d, int h, int w, int c = 0) const {
198  int const imm = ComputeOffsetFromStrides<Strides>::get(d, h, w, c);
199  return reinterpret_cast<AccessType const&>(pointer[imm]);
200  }
201 
203  CUTLASS_DEVICE AccessType& at(int d, int h, int w, int c = 0) {
204  int const imm = ComputeOffsetFromStrides<Strides>::get(d, h, w, c);
205  return reinterpret_cast<AccessType&>(pointer[imm]);
206  }
207 
209  CUTLASS_DEVICE AccessType const& operator[](int i) const {
210  return reinterpret_cast<AccessType const&>(pointer[i * kElementsPerAccess]);
211  }
212 
214  CUTLASS_DEVICE AccessType& operator[](int i) {
215  return reinterpret_cast<AccessType&>(pointer[i * kElementsPerAccess]);
216  }
217 
219  CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const { return true; }
220 
223 };
224 
226 
227 template <typename Fragment_, typename Iterations_, typename AccessType_>
232  typedef Fragment_ Fragment;
234  typedef Iterations_ Iterations;
236  typedef AccessType_ AccessType;
237 
239  typedef typename Fragment::Element Element;
241  static int const kElementsPerAccess = (int)(sizeof(AccessType) / sizeof(Element));
246 
248  template <typename OtherFragment_>
249  CUTLASS_DEVICE FragmentConstIterator(OtherFragment_& fragment, int offset = 0)
250  : pointer(reinterpret_cast<Element const*>(&fragment[offset])) {
251  static_assert(OtherFragment_::kElements >= Fragment::kElements, "");
252  }
254  CUTLASS_DEVICE FragmentConstIterator(
256  : pointer(reinterpret_cast<Element const*>(rhs_.offset)) {}
257 
259  CUTLASS_DEVICE AccessType const& at(int d, int h, int w, int c = 0) const {
260  int const imm = ComputeOffsetFromStrides<IterationsStrides>::get(d, h, w, c);
261  return reinterpret_cast<AccessType const&>(pointer[imm]);
262  }
263 
265  CUTLASS_DEVICE AccessType const& operator[](int i) const {
266  return reinterpret_cast<AccessType const&>(pointer[i * kElementsPerAccess]);
267  }
268 
270  CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const { return true; }
271 
273  Element const* pointer;
274 };
275 
277 
278 } // namespace cutlass
CUTLASS_DEVICE void clear()
Clear a fragment.
Definition: fragment.h:113
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
29 #pragma once
30 
31 #include <assert.h>
32 #include "cutlass/shape.h"
34 #include "cutlass/vector.h"
35 
36 namespace cutlass {
37 
39 
56 
73 
75 template <int alignment>
76 struct StorageType {
77  typedef uint64_t Type;
78 };
79 template <>
80 struct StorageType<4> {
81  typedef uint32_t Type;
82 };
83 template <>
84 struct StorageType<2> {
85  typedef uint16_t Type;
86 };
87 template <>
88 struct StorageType<1> {
89  typedef uint8_t Type;
90 };
91 
93 
98 template <typename Element_, int kElements_, size_t kAlignment_ = 16>
99 struct Fragment : public AlignedStruct<kAlignment_> {
101  static_assert(kAlignment_ == 16 || kAlignment_ >= sizeof(Element_), "Alignment is too small");
103  static_assert(is_pow2<kAlignment_>::value, "Alignment must be a power of two");
104 
108  typedef Element_ Element;
110  static int const kElements = kElements_;
112  static int const kAlignment = kAlignment_;
113 
116  // Avoid element-wise access for sub 32b element type
117  if (kAlignment_ >= 8 && (kElements * sizeof(Element)) % 8 == 0) {
118  uint64_t* ptr = reinterpret_cast<uint64_t*>(storage);
119  for (int i = 0; i < (kElements * sizeof(Element)) / 8; ++i) {
120  ptr[i] = uint64_t(0);
121  }
122  } else if (kAlignment_ >= 4 && (kElements * sizeof(Element)) % 4 == 0) {
123  uint32_t* ptr = reinterpret_cast<uint32_t*>(storage);
124  for (int i = 0; i < (kElements * sizeof(Element)) / 4; ++i) {
125  ptr[i] = uint32_t(0);
126  }
127  } else if (kAlignment_ >= 2 && (kElements * sizeof(Element)) % 2 == 0) {
128  uint16_t* ptr = reinterpret_cast<uint16_t*>(storage);
129  for (int i = 0; i < (kElements * sizeof(Element)) / 2; ++i) {
130  ptr[i] = uint16_t(0);
131  }
132  } else {
133  for (int i = 0; i < kElements; ++i) {
134  storage[i] = 0;
135  }
136  }
137  }
138 
140  CUTLASS_HOST_DEVICE Element& operator[](int i) { return reinterpret_cast<Element*>(storage)[i]; }
141 
143  CUTLASS_HOST_DEVICE Element const& operator[](int i) const {
144  return reinterpret_cast<Element const*>(storage)[i];
145  }
146 
147  private:
150 
152  static int const kStorageCount =
153  (sizeof(Element_) * kElements_ + sizeof(StorageType) - 1) / sizeof(StorageType);
155  StorageType storage[kStorageCount];
156 
158  static_assert(sizeof(StorageType) <= kAlignment_, "StorageType is too big for given alignment");
159 };
160 
162 
167 template <typename Fragment_, typename Iterations_, typename AccessType_>
172  typedef Fragment_ Fragment;
174  typedef Iterations_ Iterations;
176  typedef AccessType_ AccessType;
177 
179  typedef typename Fragment::Element Element;
181  static int const kElementsPerAccess = (int)(sizeof(AccessType) / sizeof(Element));
186 
188  template <typename OtherFragment_>
189  CUTLASS_HOST_DEVICE FragmentIterator(OtherFragment_& fragment, int offset = 0)
190  : pointer(reinterpret_cast<Element*>(&fragment[offset])) {
191  static_assert(OtherFragment_::kElements >= Fragment::kElements, "");
192  }
193 
195  CUTLASS_HOST_DEVICE AccessType const& at(int d, int h, int w, int c = 0) const {
196  int const imm = ComputeOffsetFromStrides<Strides>::get(d, h, w, c);
197  return reinterpret_cast<AccessType const&>(pointer[imm]);
198  }
199 
201  CUTLASS_HOST_DEVICE AccessType& at(int d, int h, int w, int c = 0) {
202  int const imm = ComputeOffsetFromStrides<Strides>::get(d, h, w, c);
203  return reinterpret_cast<AccessType&>(pointer[imm]);
204  }
205 
208  return reinterpret_cast<AccessType const&>(pointer[i * kElementsPerAccess]);
209  }
210 
213  return reinterpret_cast<AccessType&>(pointer[i * kElementsPerAccess]);
214  }
215 
217  CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const { return true; }
218 
221 };
222 
224 
225 template <typename Fragment_, typename Iterations_, typename AccessType_>
230  typedef Fragment_ Fragment;
232  typedef Iterations_ Iterations;
234  typedef AccessType_ AccessType;
235 
237  typedef typename Fragment::Element Element;
239  static int const kElementsPerAccess = (int)(sizeof(AccessType) / sizeof(Element));
244 
246  template <typename OtherFragment_>
247  CUTLASS_HOST_DEVICE FragmentConstIterator(OtherFragment_& fragment, int offset = 0)
248  : pointer(reinterpret_cast<Element const*>(&fragment[offset])) {
249  static_assert(OtherFragment_::kElements >= Fragment::kElements, "");
250  }
254  : pointer(reinterpret_cast<Element const*>(rhs_.offset)) {}
255 
257  CUTLASS_HOST_DEVICE AccessType const& at(int d, int h, int w, int c = 0) const {
258  int const imm = ComputeOffsetFromStrides<IterationsStrides>::get(d, h, w, c);
259  return reinterpret_cast<AccessType const&>(pointer[imm]);
260  }
261 
264  return reinterpret_cast<AccessType const&>(pointer[i * kElementsPerAccess]);
265  }
266 
268  CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const { return true; }
269 
271  Element const* pointer;
272 };
273 
275 
276 } // namespace cutlass
CUTLASS_HOST_DEVICE void clear()
Clear a fragment.
Definition: fragment.h:115
+
CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const
Is the iterator valid?
Definition: fragment.h:217
Definition: convert.h:33
-
CUTLASS_DEVICE Element & operator[](int i)
The accessor.
Definition: fragment.h:138
-
CUTLASS_DEVICE AccessType & at(int d, int h, int w, int c=0)
The accessor.
Definition: fragment.h:203
-
Definition: vector.h:41
-
Definition: fragment.h:228
-
CUTLASS_DEVICE AccessType const & operator[](int i) const
The accessor.
Definition: fragment.h:265
-
Shape< Shape_::kH *Shape_::kW *Shape_::kC, Shape_::kW *Shape_::kC, Shape_::kC, 1 > Shape
Definition: shape.h:155
+
Shape< Shape_::kH *Shape_::kW *Shape_::kC, Shape_::kW *Shape_::kC, Shape_::kC, elementsPerAccess > Shape
Definition: shape.h:170
+
Definition: vector.h:42
+
Definition: fragment.h:226
+
CUTLASS_HOST_DEVICE FragmentIterator(OtherFragment_ &fragment, int offset=0)
Ctor.
Definition: fragment.h:189
A template defining Fragment Concept.
Definition: fragment.h:99
-
Fragment::Element Element
The element.
Definition: fragment.h:181
-
static int const kElementsPerAccess
The number of elements per access.
Definition: fragment.h:241
-
Fragment_ Fragment
The fragment.
Definition: fragment.h:174
-
Fragment_ Fragment
The fragment.
Definition: fragment.h:232
-
CUTLASS_DEVICE AccessType & operator[](int i)
The accessor.
Definition: fragment.h:214
-
Fragment::Element Element
The element.
Definition: fragment.h:239
-
ShapeStrides< FragmentShape >::Shape IterationsStrides
The linear strides for iterations.
Definition: fragment.h:245
-
CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const
Is the iterator valid?
Definition: fragment.h:270
-
CUTLASS_DEVICE FragmentIterator(OtherFragment_ &fragment, int offset=0)
Ctor.
Definition: fragment.h:191
+
Fragment::Element Element
The element.
Definition: fragment.h:179
+
static int const kElementsPerAccess
The number of elements per access.
Definition: fragment.h:239
+
Fragment_ Fragment
The fragment.
Definition: fragment.h:172
+
Fragment_ Fragment
The fragment.
Definition: fragment.h:230
+
Fragment::Element Element
The element.
Definition: fragment.h:237
Fragment< Element_, kElements_ > This_
Make sure the alignment makes sense wrt the size of elements.
Definition: fragment.h:101
-
FragmentIterator< Fragment_, Iterations_, AccessType_ > This_
This class.
Definition: fragment.h:172
-
ShapeMul< Iterations, Shape< 1, 1, 1, kElementsPerAccess > >::Shape FragmentShape
The shape of the the fragment.
Definition: fragment.h:243
+
FragmentIterator< Fragment_, Iterations_, AccessType_ > This_
This class.
Definition: fragment.h:170
+
ShapeMul< Iterations, Shape< 1, 1, 1, kElementsPerAccess > >::Shape FragmentShape
The shape of the fragment.
Definition: fragment.h:241
Math utilities.
Definition: fragment.h:76
uint32_t Type
Definition: fragment.h:81
uint8_t Type
Definition: fragment.h:89
-
static CUTLASS_DEVICE int get(int d, int h, int w, int c)
Definition: shape.h:211
-
Element * pointer
The pointer.
Definition: fragment.h:222
-
AccessType_ AccessType
The access type.
Definition: fragment.h:236
+
Element * pointer
The pointer.
Definition: fragment.h:220
+
CUTLASS_HOST_DEVICE Element const & operator[](int i) const
The accessor.
Definition: fragment.h:143
+
AccessType_ AccessType
The access type.
Definition: fragment.h:234
+
ShapeStrides< FragmentShape, kElementsPerAccess >::Shape IterationsStrides
The linear strides for iterations.
Definition: fragment.h:243
Definition: shape.h:118
-
ShapeMul< Iterations, Shape< 1, 1, 1, kElementsPerAccess > >::Shape FragmentShape
The shape of the the fragment.
Definition: fragment.h:185
-
A template defining Fragment Iterator Concept.
Definition: fragment.h:170
+
ShapeMul< Iterations, Shape< 1, 1, 1, kElementsPerAccess > >::Shape FragmentShape
The shape of the fragment.
Definition: fragment.h:183
+
CUTLASS_HOST_DEVICE FragmentConstIterator(OtherFragment_ &fragment, int offset=0)
Ctor.
Definition: fragment.h:247
+
A template defining Fragment Iterator Concept.
Definition: fragment.h:168
static int const kElements
The number of elements.
Definition: fragment.h:110
-
CUTLASS_DEVICE Element const & operator[](int i) const
The accessor.
Definition: fragment.h:144
-
Iterations_ Iterations
The number of iterations.
Definition: fragment.h:234
-
#define static_assert(__e, __m)
Definition: platform.h:145
-
Iterations_ Iterations
The number of iterations.
Definition: fragment.h:176
+
#define CUTLASS_HOST_DEVICE
Definition: cutlass.h:46
+
Iterations_ Iterations
The number of iterations.
Definition: fragment.h:232
+
CUTLASS_HOST_DEVICE AccessType const & at(int d, int h, int w, int c=0) const
The accessor.
Definition: fragment.h:195
+
#define static_assert(__e, __m)
Definition: platform.h:153
+
Iterations_ Iterations
The number of iterations.
Definition: fragment.h:174
+
CUTLASS_HOST_DEVICE FragmentConstIterator(FragmentIterator< Fragment_, Iterations_, AccessType_ > const &rhs_)
Create from non-constant FragmentIterator.
Definition: fragment.h:252
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
-
CUTLASS_DEVICE AccessType const & at(int d, int h, int w, int c=0) const
The accessor.
Definition: fragment.h:259
Element_ Element
The element.
Definition: fragment.h:108
-
FragmentIterator< Fragment_, Iterations_, AccessType_ > This_
This class.
Definition: fragment.h:230
-
CUTLASS_DEVICE AccessType const & operator[](int i) const
The accessor.
Definition: fragment.h:209
+
FragmentIterator< Fragment_, Iterations_, AccessType_ > This_
This class.
Definition: fragment.h:228
+
CUTLASS_HOST_DEVICE AccessType const & operator[](int i) const
The accessor.
Definition: fragment.h:263
+
CUTLASS_HOST_DEVICE Element & operator[](int i)
The accessor.
Definition: fragment.h:140
+
CUTLASS_HOST_DEVICE AccessType const & operator[](int i) const
The accessor.
Definition: fragment.h:207
uint16_t Type
Definition: fragment.h:85
Defines a 1D vector of elements held in the registers of each thread.
-
CUTLASS_DEVICE FragmentConstIterator(FragmentIterator< Fragment_, Iterations_, AccessType_ > const &rhs_)
Create from non-constant FragmentIterator.
Definition: fragment.h:254
-
static int const kElementsPerAccess
The number of elements per access.
Definition: fragment.h:183
-
ShapeStrides< FragmentShape >::Shape Strides
The linear strides for iterations.
Definition: fragment.h:187
+
uint64_t Type
Definition: fragment.h:77
+
CUTLASS_HOST_DEVICE bool valid(int d, int h, int w, int c) const
Is the iterator valid?
Definition: fragment.h:268
+
ShapeStrides< FragmentShape, kElementsPerAccess >::Shape Strides
The linear strides for iterations.
Definition: fragment.h:185
+
static CUTLASS_HOST_DEVICE int get(int d, int h, int w, int c)
Definition: shape.h:199
+
CUTLASS_HOST_DEVICE AccessType & operator[](int i)
The accessor.
Definition: fragment.h:212
+
CUTLASS_HOST_DEVICE AccessType & at(int d, int h, int w, int c=0)
The accessor.
Definition: fragment.h:201
+
static int const kElementsPerAccess
The number of elements per access.
Definition: fragment.h:181
Defines Shape implementing the Layout concept for representing a 4D hypercube of objects.
-
AccessType_ AccessType
The access type.
Definition: fragment.h:178
-
CUTLASS_DEVICE bool valid(int d, int h, int w, int c) const
Is the iterator valid?
Definition: fragment.h:219
-
uint64_t Type
Definition: fragment.h:77
+
AccessType_ AccessType
The access type.
Definition: fragment.h:176
+
static int const kAlignment
Alignment.
Definition: fragment.h:112
Definition: cutlass_math.h:45
-
CUTLASS_DEVICE FragmentConstIterator(OtherFragment_ &fragment, int offset=0)
Ctor.
Definition: fragment.h:249
-
CUTLASS_DEVICE AccessType const & at(int d, int h, int w, int c=0) const
The accessor.
Definition: fragment.h:197
-
Element const * pointer
The pointer.
Definition: fragment.h:273
+
CUTLASS_HOST_DEVICE AccessType const & at(int d, int h, int w, int c=0) const
The accessor.
Definition: fragment.h:257
+
Element const * pointer
The pointer.
Definition: fragment.h:271
diff --git a/docs/fragment__multiply__add_8h.html b/docs/fragment__multiply__add_8h.html index 59a94dfdff..107cfee79a 100644 --- a/docs/fragment__multiply__add_8h.html +++ b/docs/fragment__multiply__add_8h.html @@ -82,15 +82,15 @@

Defines multiply-add operations on fragments within a thread. More...

-

Classes

struct  cutlass::StorageType< kAlignment_ >
struct  cutlass::StorageType< alignment >
 
struct  cutlass::StorageType< 4 >
 
- + - +

Classes

struct  cutlass::gemm::FragmentMultiplyAdd< Scalar_ >
struct  cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >
 
struct  cutlass::gemm::FragmentMultiplyAdd< half >
struct  cutlass::gemm::FragmentMultiplyAdd< half, half, true >
 

@@ -103,7 +103,7 @@ diff --git a/docs/fragment__multiply__add_8h_source.html b/docs/fragment__multiply__add_8h_source.html index 9b453fd942..1d4c4f7f22 100644 --- a/docs/fragment__multiply__add_8h_source.html +++ b/docs/fragment__multiply__add_8h_source.html @@ -76,28 +76,26 @@
fragment_multiply_add.h
-Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #include <cutlass/fragment.h>
31 
32 namespace cutlass {
33 namespace gemm {
34 
36 
37 template <typename Scalar_>
42  typedef Scalar_ ScalarA;
44  typedef Scalar_ ScalarB;
46  typedef Scalar_ ScalarC;
47 
49  CUTLASS_DEVICE FragmentMultiplyAdd() {}
50 
52  template <typename Fragment_>
53  CUTLASS_DEVICE void multiply(Scalar_ a, Fragment_ const& b, Fragment_& d) {
54  for (int j = 0; j < Fragment_::kElements; ++j) {
55  d[j] = a * b[j];
56  }
57  }
58 
60  template <typename Fragment_>
61  CUTLASS_DEVICE void multiply_add(Scalar_ a,
62  Fragment_ const& b,
63  Fragment_ const& c,
64  Fragment_& d) {
65  for (int j = 0; j < Fragment_::kElements; ++j) {
66  d[j] = a * b[j] + c[j];
67  }
68  }
69 };
70 
72 
73 #if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
74 template <>
75 struct FragmentMultiplyAdd<half> {
79  typedef half ScalarA;
81  typedef half ScalarB;
83  typedef half ScalarC;
84 
86  CUTLASS_DEVICE FragmentMultiplyAdd() {}
87 
89  template <typename Fragment_>
90  CUTLASS_DEVICE void multiply(half a, Fragment_ const& b, Fragment_& d) {
91 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
92  // The input.
93  __half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
94  // The output.
95  __half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);
96 
97  // Assemble a half2 from a.
98  __half2 const a_half2 = __half2half2(a);
99 
100  for (int i = 0; i < Fragment_::kElements / 2; ++i) {
101  d_half2[i] = __hmul2(a_half2, b_half2[i]);
102  }
103 #endif
104  }
105 
107  template <typename Fragment_>
108  CUTLASS_DEVICE void multiply_add(half a, Fragment_ const& b, Fragment_ const& c, Fragment_& d) {
109 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
110  // The inputs.
111  __half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
112  __half2 const* c_half2 = reinterpret_cast<__half2 const*>(&c[0]);
113  // The output.
114  __half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);
115 
116  // Assemble a half2 from a.
117  __half2 const a_half2 = __half2half2(a);
118 
119  for (int i = 0; i < Fragment_::kElements / 2; ++i) {
120  d_half2[i] = __hfma2(a_half2, b_half2[i], c_half2[i]);
121  }
122 #endif
123  }
124 };
125 
126 #endif
127 
129 
130 } // namespace gemm
131 } // namespace cutlass
Scalar_ ScalarB
The type for B.
Definition: fragment_multiply_add.h:44
+Go to the documentation of this file.
1 /***************************************************************************************************
2  * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without modification, are permitted
5  * provided that the following conditions are met:
6  * * Redistributions of source code must retain the above copyright notice, this list of
7  * conditions and the following disclaimer.
8  * * Redistributions in binary form must reproduce the above copyright notice, this list of
9  * conditions and the following disclaimer in the documentation and/or other materials
10  * provided with the distribution.
11  * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
12  * to endorse or promote products derived from this software without specific prior written
13  * permission.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
17  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
19  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
20  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
21  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
22  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23  *
24  **************************************************************************************************/
28 #pragma once
29 
30 #include "cutlass/fragment.h"
31 
32 namespace cutlass {
33 namespace gemm {
34 
36 
37 template < typename ScalarAlphaBeta_,
38  typename ScalarAccum_,
39  bool fragMul2 = true /*number of element per fragment is multiple of 2*/
40 >
45  typedef ScalarAlphaBeta_ ScalarAlphaBeta;
47  typedef ScalarAccum_ ScalarAccum;
48 
50  CUTLASS_DEVICE FragmentMultiplyAdd() {}
51 
53  template <typename FragmentB_, typename FragmentCd_>
54  CUTLASS_DEVICE void multiply(ScalarAlphaBeta a, FragmentB_ const& b, FragmentCd_& d) {
55 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
56  int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
57  for (int j = 0; j < FragmentCd_::kElements; ++j) {
58  d[j] = b[j * kReduction + 0];
59  for (int k = 1; k < kReduction; ++k) {
60  d[j] += b[j * kReduction + k];
61  }
62  d[j] = a * ScalarAlphaBeta(d[j]);
63  }
64 #endif
65  }
66 
68  template <typename FragmentB_, typename FragmentCd_>
69  CUTLASS_DEVICE void multiply_add(ScalarAlphaBeta a,
70  FragmentB_ const& b,
71  FragmentCd_ const& c,
72  FragmentCd_& d) {
73 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
74  int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
75  for (int j = 0; j < FragmentCd_::kElements; ++j) {
76  d[j] = b[j * kReduction + 0];
77  for (int k = 1; k < kReduction; ++k) {
78  d[j] += b[j * kReduction + k];
79  }
80  d[j] = a * ScalarAlphaBeta(d[j]) + ScalarAlphaBeta(c[j]);
81  }
82 #endif
83  }
84 };
85 
87 
88 #if !defined(__CUDACC_RTC__) || defined(CUTLASS_NVRTC_HAS_FP16)
89 template <>
90 struct FragmentMultiplyAdd<half, half, true> {
94  typedef half ScalarAlphaBeta;
96  typedef half ScalarAccum;
97 
99  CUTLASS_DEVICE FragmentMultiplyAdd() {}
100 
102  template <typename FragmentB_, typename FragmentCd_>
103  CUTLASS_DEVICE void multiply(half a, FragmentB_ const& b, FragmentCd_& d) {
104 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
105  // The input.
106  __half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
107  // The output.
108  __half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);
109 
110  // Assemble a half2 from a.
111  __half2 const a_half2 = __half2half2(a);
112 
113  int const kReduction = (FragmentB_::kElements / FragmentCd_::kElements);
114 
115  for (int j = 0; j < FragmentCd_::kElements / 2; ++j) {
116  d_half2[j] = __hmul2(a_half2, b_half2[j * kReduction + 0]);
117 
118  for (int k = 1; k < kReduction; ++k) {
119  d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]);
120  }
121  }
122 #endif
123  }
124 
125 
127  template <typename FragmentB_, typename FragmentCd_>
128  CUTLASS_DEVICE void multiply_add(half a,
129  FragmentB_ const& b,
130  FragmentCd_ const& c,
131  FragmentCd_& d) {
132 #if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
133  // The inputs.
134  __half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
135  __half2 const* c_half2 = reinterpret_cast<__half2 const*>(&c[0]);
136  // The output.
137  __half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);
138 
139  // Assemble a half2 from a.
140  __half2 const a_half2 = __half2half2(a);
141 
142  int const kReduction = (FragmentB_::kElements / FragmentCd_::kElements);
143  for (int j = 0; j < FragmentCd_::kElements / 2; ++j) {
144  d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + 0], c_half2[j]);
145 
146  for (int k = 1; k < kReduction; ++k) {
147  d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]);
148  }
149  }
150 #endif
151  }
152 };
153 
154 #endif
155 
157 
158 } // namespace gemm
159 } // namespace cutlass
CUTLASS_DEVICE void multiply(ScalarAlphaBeta a, FragmentB_ const &b, FragmentCd_ &d)
Multiply : d = a*b.
Definition: fragment_multiply_add.h:54
+
Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: fragment_multiply_add.h:92
Definition: convert.h:33
-
CUTLASS_DEVICE void multiply(Scalar_ a, Fragment_ const &b, Fragment_ &d)
Multiply : d = a*b.
Definition: fragment_multiply_add.h:53
-
half ScalarA
The type for A.
Definition: fragment_multiply_add.h:79
-
CUTLASS_DEVICE FragmentMultiplyAdd()
Ctor.
Definition: fragment_multiply_add.h:86
-
CUTLASS_DEVICE void multiply_add(Scalar_ a, Fragment_ const &b, Fragment_ const &c, Fragment_ &d)
Multiply : d = a*b + c.
Definition: fragment_multiply_add.h:61
-
half ScalarC
The type for C and D.
Definition: fragment_multiply_add.h:83
-
CUTLASS_DEVICE void multiply_add(half a, Fragment_ const &b, Fragment_ const &c, Fragment_ &d)
Multiply : d = a*b + c.
Definition: fragment_multiply_add.h:108
+
half ScalarAlphaBeta
The type for alpha and beta.
Definition: fragment_multiply_add.h:94
+
CUTLASS_DEVICE FragmentMultiplyAdd()
Ctor.
Definition: fragment_multiply_add.h:50
+
CUTLASS_DEVICE FragmentMultiplyAdd()
Ctor.
Definition: fragment_multiply_add.h:99
+
CUTLASS_DEVICE void multiply(half a, FragmentB_ const &b, FragmentCd_ &d)
Multiply : d = a*b.
Definition: fragment_multiply_add.h:103
+
ScalarAccum_ ScalarAccum
The type for the accumulator.
Definition: fragment_multiply_add.h:47
A Shape implementing Layout Concept describing the dimensions of a cube.
Definition: shape.h:64
-
Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: fragment_multiply_add.h:40
-
Scalar_ ScalarC
The type for C and D.
Definition: fragment_multiply_add.h:46
-
Scalar_ ScalarA
The type for A.
Definition: fragment_multiply_add.h:42
-
CUTLASS_DEVICE FragmentMultiplyAdd()
Ctor.
Definition: fragment_multiply_add.h:49
+
ScalarAlphaBeta_ ScalarAlphaBeta
The type for alpha and beta.
Definition: fragment_multiply_add.h:45
+
CUTLASS_DEVICE void multiply_add(half a, FragmentB_ const &b, FragmentCd_ const &c, FragmentCd_ &d)
Multiply : d = a*b + c.
Definition: fragment_multiply_add.h:128
+
Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: fragment_multiply_add.h:43
Defines Fragment, a statically-sized array for storing parts of matrices within a thread&#39;s registers...
-
CUTLASS_DEVICE void multiply(half a, Fragment_ const &b, Fragment_ &d)
Multiply : d = a*b.
Definition: fragment_multiply_add.h:90
-
Shape< 1, 1, 1, 1 > InstructionShape
The shape of the instruction.
Definition: fragment_multiply_add.h:77
-
half ScalarB
The type for B.
Definition: fragment_multiply_add.h:81
-
Definition: fragment_multiply_add.h:38
+
half ScalarAccum
The type for the accumulator.
Definition: fragment_multiply_add.h:96
+
CUTLASS_DEVICE void multiply_add(ScalarAlphaBeta a, FragmentB_ const &b, FragmentCd_ const &c, FragmentCd_ &d)
Multiply : d = a*b + c.
Definition: fragment_multiply_add.h:69
+
Definition: fragment_multiply_add.h:41
diff --git a/docs/functions.html b/docs/functions.html index e6b156fbce..bdde612a82 100644 --- a/docs/functions.html +++ b/docs/functions.html @@ -71,77 +71,101 @@
Here is a list of all class members with links to the classes they belong to:

- a -

diff --git a/docs/functions_0x7e.html b/docs/functions_0x7e.html index 41aa664c47..0cb0e3458c 100644 --- a/docs/functions_0x7e.html +++ b/docs/functions_0x7e.html @@ -78,7 +78,7 @@

- ~ -

diff --git a/docs/structcutlass_1_1StorageType_3_012_01_4.html b/docs/structcutlass_1_1StorageType_3_012_01_4.html index 8464872574..3c9d5b3d22 100644 --- a/docs/structcutlass_1_1StorageType_3_012_01_4.html +++ b/docs/structcutlass_1_1StorageType_3_012_01_4.html @@ -108,7 +108,7 @@

diff --git a/docs/structcutlass_1_1StorageType_3_014_01_4-members.html b/docs/structcutlass_1_1StorageType_3_014_01_4-members.html index bf78873c4f..4a6a49f38d 100644 --- a/docs/structcutlass_1_1StorageType_3_014_01_4-members.html +++ b/docs/structcutlass_1_1StorageType_3_014_01_4-members.html @@ -83,7 +83,7 @@

diff --git a/docs/structcutlass_1_1StorageType_3_014_01_4.html b/docs/structcutlass_1_1StorageType_3_014_01_4.html index 74751e5779..3640ba5ffc 100644 --- a/docs/structcutlass_1_1StorageType_3_014_01_4.html +++ b/docs/structcutlass_1_1StorageType_3_014_01_4.html @@ -108,7 +108,7 @@

diff --git a/docs/structcutlass_1_1Store-members.html b/docs/structcutlass_1_1Store-members.html index f942adf611..909907277c 100644 --- a/docs/structcutlass_1_1Store-members.html +++ b/docs/structcutlass_1_1Store-members.html @@ -73,18 +73,18 @@
-
cutlass::Store< Scalar_, Lanes_, Memory_, bool, size_t > Member List
+
cutlass::Store< Scalar_, kAccessSize, Memory_, kFragmentElementType, FragmentElement_, kStride, size > Member List
diff --git a/docs/structcutlass_1_1Store.html b/docs/structcutlass_1_1Store.html index 057010d126..893444c286 100644 --- a/docs/structcutlass_1_1Store.html +++ b/docs/structcutlass_1_1Store.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::Store< Scalar_, Lanes_, Memory_, bool, size_t > Struct Template Reference +Cutlass: cutlass::Store< Scalar_, kAccessSize, Memory_, kFragmentElementType, FragmentElement_, kStride, size > Struct Template Reference @@ -77,7 +77,7 @@ Static Public Member Functions | List of all members
-
cutlass::Store< Scalar_, Lanes_, Memory_, bool, size_t > Struct Template Reference
+
cutlass::Store< Scalar_, kAccessSize, Memory_, kFragmentElementType, FragmentElement_, kStride, size > Struct Template Reference
@@ -85,27 +85,27 @@ - - - + + +

Public Types

typedef Vectorize< Scalar_, Lanes_ >::Type AccessType
 The output type. More...
 
typedef Vectorize< FragmentElement_, kAccessSize >::Type AccessType
 The output type. More...
 
- - - + + +

Static Public Member Functions

static CUTLASS_DEVICE void store (AccessType const &src, Scalar_ *pointer, int offset)
 The store function. More...
 
static CUTLASS_HOST_DEVICE void store (AccessType const &src, Scalar_ *pointer, int offset)
 The store function. More...
 

Member Typedef Documentation

- -

◆ AccessType

+ +

◆ AccessType

-template<typename Scalar_ , int Lanes_, MemorySpace::Kind Memory_, bool = (Lanes_ > 1), size_t = (sizeof(Scalar_) * Lanes_)>
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, FragmentElementType::Kind kFragmentElementType = FragmentElementType::kScalar, typename FragmentElement_ = Scalar_, int kStride = 1, size_t size = (sizeof(Scalar_) * kAccessSize)>
- +
typedef Vectorize<Scalar_, Lanes_>::Type cutlass::Store< Scalar_, Lanes_, Memory_, bool, size_t >::AccessTypetypedef Vectorize<FragmentElement_, kAccessSize>::Type cutlass::Store< Scalar_, kAccessSize, Memory_, kFragmentElementType, FragmentElement_, kStride, size >::AccessType
@@ -113,21 +113,21 @@

Member Function Documentation

- -

◆ store()

+ +

◆ store()

-template<typename Scalar_ , int Lanes_, MemorySpace::Kind Memory_, bool = (Lanes_ > 1), size_t = (sizeof(Scalar_) * Lanes_)>
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, FragmentElementType::Kind kFragmentElementType = FragmentElementType::kScalar, typename FragmentElement_ = Scalar_, int kStride = 1, size_t size = (sizeof(Scalar_) * kAccessSize)>
- +
- + - + @@ -163,7 +163,7 @@

diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k004b304998a534d76357f834068909f8.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k004b304998a534d76357f834068909f8.html new file mode 100644 index 0000000000..469365a9f3 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k004b304998a534d76357f834068909f8.html @@ -0,0 +1,92 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+

static CUTLASS_DEVICE void cutlass::Store< Scalar_, Lanes_, Memory_, bool, size_t >::store static CUTLASS_HOST_DEVICE void cutlass::Store< Scalar_, kAccessSize, Memory_, kFragmentElementType, FragmentElement_, kStride, size >::store (AccessType const & AccessType const &  src,
+ + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+ + + + + + + + + +
+
+ + +
+ +
+ + + +
+
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8 > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k12f5c8a016a307e76de374322fc00a66.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k12f5c8a016a307e76de374322fc00a66.html new file mode 100644 index 0000000000..b9b3985d54 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k12f5c8a016a307e76de374322fc00a66.html @@ -0,0 +1,92 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k220d5790f803f10840e2a92fb9a51dac.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k220d5790f803f10840e2a92fb9a51dac.html new file mode 100644 index 0000000000..14e6874159 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k220d5790f803f10840e2a92fb9a51dac.html @@ -0,0 +1,171 @@ + + + + + + + +Cutlass: cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16 > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16 > Struct Template Reference
+
+
+ +

#include <load_store.h>

+ + + + + +

+Public Types

typedef Vectorize< Scalar_, kAccessSize >::Type AccessType
 The output type. More...
 
+ + + + +

+Static Public Member Functions

static CUTLASS_HOST_DEVICE void store (AccessType const &src, Scalar_ *pointer, int offset)
 The store function. More...
 
+

Member Typedef Documentation

+ +

◆ AccessType

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, int kStride>
+ + + + +
typedef Vectorize<Scalar_, kAccessSize>::Type cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16 >::AccessType
+
+ +
+
+

Member Function Documentation

+ +

◆ store()

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, int kStride>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
static CUTLASS_HOST_DEVICE void cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16 >::store (AccessType const & src,
Scalar_ * pointer,
int offset 
)
+
+inlinestatic
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k28cc0b88a16efca73d258128312d2a7e.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k28cc0b88a16efca73d258128312d2a7e.html new file mode 100644 index 0000000000..b5f8d29d05 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k28cc0b88a16efca73d258128312d2a7e.html @@ -0,0 +1,171 @@ + + + + + + + +Cutlass: cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2 > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2 > Struct Template Reference
+
+
+ +

#include <load_store.h>

+ + + + + +

+Public Types

typedef Vectorize< Scalar_, kAccessSize >::Type AccessType
 The output type. More...
 
+ + + + +

+Static Public Member Functions

static CUTLASS_HOST_DEVICE void store (AccessType const &src, Scalar_ *pointer, int offset)
 The store function. More...
 
+

Member Typedef Documentation

+ +

◆ AccessType

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_>
+ + + + +
typedef Vectorize<Scalar_, kAccessSize>::Type cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2 >::AccessType
+
+ +
+
+

Member Function Documentation

+ +

◆ store()

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
static CUTLASS_HOST_DEVICE void cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2 >::store (AccessType const & src,
Scalar_ * pointer,
int offset 
)
+
+inlinestatic
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k40d038d4bce377843c21a56ebf97d011.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k40d038d4bce377843c21a56ebf97d011.html new file mode 100644 index 0000000000..2e3d711850 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k40d038d4bce377843c21a56ebf97d011.html @@ -0,0 +1,171 @@ + + + + + + + +Cutlass: cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4 > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4 > Struct Template Reference
+
+
+ +

#include <load_store.h>

+ + + + + +

+Public Types

typedef Vectorize< Scalar_, kAccessSize >::Type AccessType
 The output type. More...
 
+ + + + +

+Static Public Member Functions

static CUTLASS_HOST_DEVICE void store (AccessType const &src, Scalar_ *pointer, int offset)
 The store function. More...
 
+

Member Typedef Documentation

+ +

◆ AccessType

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, int kStride>
+ + + + +
typedef Vectorize<Scalar_, kAccessSize>::Type cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4 >::AccessType
+
+ +
+
+

Member Function Documentation

+ +

◆ store()

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, int kStride>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
static CUTLASS_HOST_DEVICE void cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4 >::store (AccessType const & src,
Scalar_ * pointer,
int offset 
)
+
+inlinestatic
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k60eedca420c41e94fd40b41299967ef2.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k60eedca420c41e94fd40b41299967ef2.html new file mode 100644 index 0000000000..7b902f52b1 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k60eedca420c41e94fd40b41299967ef2.html @@ -0,0 +1,92 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 16 > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k775a1d27affec5236489735ed4503c92.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k775a1d27affec5236489735ed4503c92.html new file mode 100644 index 0000000000..4d1705582d --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k775a1d27affec5236489735ed4503c92.html @@ -0,0 +1,92 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, 1, 2 > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k84da7dcd68ee74b8d2bdb67885b0ca56.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k84da7dcd68ee74b8d2bdb67885b0ca56.html new file mode 100644 index 0000000000..1f1c686d41 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1k84da7dcd68ee74b8d2bdb67885b0ca56.html @@ -0,0 +1,92 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 4 > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1kd9a7e85f80a21c504388612a60462417.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1kd9a7e85f80a21c504388612a60462417.html new file mode 100644 index 0000000000..41a456e7c6 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1kd9a7e85f80a21c504388612a60462417.html @@ -0,0 +1,171 @@ + + + + + + + +Cutlass: cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size > Struct Template Reference
+
+
+ +

#include <load_store.h>

+ + + + + +

+Public Types

typedef FragmentElement_ AccessType
 The input type. More...
 
+ + + + +

+Static Public Member Functions

static CUTLASS_HOST_DEVICE void store (AccessType const &value, Scalar_ *pointer, int offset)
 The store function. More...
 
+

Member Typedef Documentation

+ +

◆ AccessType

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, typename FragmentElement_ , int kStride, size_t size>
+ + + + +
typedef FragmentElement_ cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size >::AccessType
+
+ +
+
+

Member Function Documentation

+ +

◆ store()

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, typename FragmentElement_ , int kStride, size_t size>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
static CUTLASS_HOST_DEVICE void cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kWmmaMatrix, FragmentElement_, kStride, size >::store (AccessType const & value,
Scalar_ * pointer,
int offset 
)
+
+inlinestatic
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1ke6d73d34fa7b5254cf828804a19842e1.html b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1ke6d73d34fa7b5254cf828804a19842e1.html new file mode 100644 index 0000000000..46c9d9b447 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01Scalar___00_01kAccessSize_00_01Memory___00_01FragmentElementType_1_1ke6d73d34fa7b5254cf828804a19842e1.html @@ -0,0 +1,171 @@ + + + + + + + +Cutlass: cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8 > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8 > Struct Template Reference
+
+
+ +

#include <load_store.h>

+ + + + + +

+Public Types

typedef Vectorize< Scalar_, kAccessSize >::Type AccessType
 The output type. More...
 
+ + + + +

+Static Public Member Functions

static CUTLASS_HOST_DEVICE void store (AccessType const &src, Scalar_ *pointer, int offset)
 The store function. More...
 
+

Member Typedef Documentation

+ +

◆ AccessType

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, int kStride>
+ + + + +
typedef Vectorize<Scalar_, kAccessSize>::Type cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8 >::AccessType
+
+ +
+
+

Member Function Documentation

+ +

◆ store()

+ +
+
+
+template<typename Scalar_ , int kAccessSize, MemorySpace::Kind Memory_, int kStride>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
static CUTLASS_HOST_DEVICE void cutlass::Store< Scalar_, kAccessSize, Memory_, FragmentElementType::kScalar, Scalar_, kStride, 8 >::store (AccessType const & src,
Scalar_ * pointer,
int offset 
)
+
+inlinestatic
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1Store_3_01double_00_012_00_01Memory___00_01FragmentElementType_1_1kScalar_00_013d38935f41bf709e067932b9e042255a.html b/docs/structcutlass_1_1Store_3_01double_00_012_00_01Memory___00_01FragmentElementType_1_1kScalar_00_013d38935f41bf709e067932b9e042255a.html new file mode 100644 index 0000000000..70da9ff685 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01double_00_012_00_01Memory___00_01FragmentElementType_1_1kScalar_00_013d38935f41bf709e067932b9e042255a.html @@ -0,0 +1,92 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::Store< double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16 > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1Store_3_01double_00_012_00_01Memory___00_01FragmentElementType_1_1kScalar_00_0160391c6be5cb1d3f99e012a6a18e486d.html b/docs/structcutlass_1_1Store_3_01double_00_012_00_01Memory___00_01FragmentElementType_1_1kScalar_00_0160391c6be5cb1d3f99e012a6a18e486d.html new file mode 100644 index 0000000000..2bbfac9824 --- /dev/null +++ b/docs/structcutlass_1_1Store_3_01double_00_012_00_01Memory___00_01FragmentElementType_1_1kScalar_00_0160391c6be5cb1d3f99e012a6a18e486d.html @@ -0,0 +1,171 @@ + + + + + + + +Cutlass: cutlass::Store< double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16 > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::Store< double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16 > Struct Template Reference
+
+
+ +

#include <load_store.h>

+ + + + + +

+Public Types

typedef Vectorize< double, 2 >::Type AccessType
 The output type. More...
 
+ + + + +

+Static Public Member Functions

static CUTLASS_HOST_DEVICE void store (AccessType const &src, double *pointer, int offset)
 The store function. More...
 
+

Member Typedef Documentation

+ +

◆ AccessType

+ +
+
+
+template<MemorySpace::Kind Memory_, int kStride>
+ + + + +
typedef Vectorize<double, 2>::Type cutlass::Store< double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16 >::AccessType
+
+ +
+
+

Member Function Documentation

+ +

◆ store()

+ +
+
+
+template<MemorySpace::Kind Memory_, int kStride>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
static CUTLASS_HOST_DEVICE void cutlass::Store< double, 2, Memory_, FragmentElementType::kScalar, double, kStride, 16 >::store (AccessType const & src,
double * pointer,
int offset 
)
+
+inlinestatic
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TensorRefArray-members.html b/docs/structcutlass_1_1TensorRefArray-members.html new file mode 100644 index 0000000000..1a0ae6f7b5 --- /dev/null +++ b/docs/structcutlass_1_1TensorRefArray-members.html @@ -0,0 +1,101 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1TensorRefArray.html b/docs/structcutlass_1_1TensorRefArray.html new file mode 100644 index 0000000000..6aa5adbe83 --- /dev/null +++ b/docs/structcutlass_1_1TensorRefArray.html @@ -0,0 +1,398 @@ + + + + + + + +Cutlass: cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Struct Template Reference
+
+
+ +

#include <tensor_ref_collection.h>

+ + + + + +

+Classes

class  ConstIterator
 TensorRefIterator over TensorRef objects in TensorRefArray. More...
 
+ + + + + + + + + + + + + +

+Public Types

typedef TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > TensorRef
 TensorRef type obtained from the TensorRefArray. More...
 
typedef Storage_ Storage
 Element pointed to by the TensorRef. More...
 
typedef Index_ Index
 Index type. More...
 
typedef LongIndex_ LongIndex
 Typically, strides in memory can be very large. More...
 
+ + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE TensorArrayRef ()
 
CUTLASS_HOST_DEVICE TensorArrayRef (Storage **_pointers, Index _strides[kStorageRank - 1])
 
CUTLASS_HOST_DEVICE TensorRef at (Index idx) const
 
CUTLASS_HOST_DEVICE ConstIterator begin ()
 Returns a TensorRefIterator over the TensorRef objects in this collection. More...
 
+ + + + + + + +

+Public Attributes

Storage ** pointers
 Base addresses. More...
 
Index * strides [kStorageRank - 1]
 Array of strides. More...
 
+ + + + +

+Static Public Attributes

static int const kStorageRank = TensorRef::kStorageRank
 Rank of the stride vector. More...
 
+

Detailed Description

+

template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+struct cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >

+ +

This satisfies TensorRefCollection and stores a collection of TensorRef objects. This is a structure of arrays in that the individual members of the TensorRef are held in distinct arrays.

+

Note, TensorRef maps a logical coordinate space to an n-D array with rank kStorageRank. It maintains a stride vector of similar rank, but the least significant rank is defined to be 1.

+

The least significant stride of 1 is not stored, and therefore the number of stride arrays is kStorageRank - 1.

+

Member Typedef Documentation

+ +

◆ Index

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Index_ cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Index
+
+ +
+
+ +

◆ LongIndex

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef LongIndex_ cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::LongIndex
+
+ +
+
+ +

◆ Storage

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Storage_ cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Storage
+
+ +
+
+ +

◆ TensorRef

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef TensorRef<Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_> cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRef
+
+ +
+
+

Member Function Documentation

+ +

◆ at()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TensorRef cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::at (Index idx) const
+
+inline
+
+ +
+
+ +

◆ begin()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::begin ()
+
+inline
+
+ +
+
+ +

◆ TensorArrayRef() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorArrayRef ()
+
+inline
+
+ +
+
+ +

◆ TensorArrayRef() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorArrayRef (Storage ** _pointers,
Index _strides[kStorageRank - 1] 
)
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ kStorageRank

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + +
int const cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::kStorageRank = TensorRef::kStorageRank
+
+static
+
+ +
+
+ +

◆ pointers

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
Storage** cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::pointers
+
+ +
+
+ +

◆ strides

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
Index* cutlass::TensorRefArray< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::strides[kStorageRank - 1]
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TensorRefBatchStrided-members.html b/docs/structcutlass_1_1TensorRefBatchStrided-members.html new file mode 100644 index 0000000000..0319ad8276 --- /dev/null +++ b/docs/structcutlass_1_1TensorRefBatchStrided-members.html @@ -0,0 +1,134 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Member List
+
+
+ +

This is the complete list of members for cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
add_pointer_offset(LongIndex delta)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
at(Index idx) constcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::at(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::at(LongIndex idx) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Base typedefcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
begin()cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
const_ref() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
ConstTensorRef typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
Coord_t typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
data() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
get_pointer_offset(Index idx) constcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
good() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Index typedefcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
kRankcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >static
kStorageRankcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >static
leading_dim(int idx=0) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
LongIndex typedefcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
map(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
MapFunc typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
offset(TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator+(TensorCoord const &b) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator+=(TensorCoord const &b)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator-(TensorCoord const &b) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator-=(TensorCoord const &b)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator[](TensorCoord const &coord) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
operator[](LongIndex idx) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Rankcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >static
reset(Storage *ptr=nullptr)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
reset(Storage *ptr, StorageCoord const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
Storage typedefcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
StorageCoord typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
stride() constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
stride(int dim) constcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
StrideVector typedefcutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
tensor_stridecutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
TensorCoord typedefcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
TensorRef typedefcutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
cutlass::TensorRef::TensorRef(Storage *ptr=nullptr)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::TensorRef(Storage *ptr, Index ldm)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::TensorRef(Storage *ptr, StrideVector const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::TensorRef(Storage *ptr, StorageCoord const &stride)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
cutlass::TensorRef::TensorRef(TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const &ref)cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorRefBatchStrided()cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
TensorRefBatchStrided(TensorRef const &ref, LongIndex _tensor_stride=0)cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >inline
+ + + + diff --git a/docs/structcutlass_1_1TensorRefBatchStrided.html b/docs/structcutlass_1_1TensorRefBatchStrided.html new file mode 100644 index 0000000000..b1c9693102 --- /dev/null +++ b/docs/structcutlass_1_1TensorRefBatchStrided.html @@ -0,0 +1,537 @@ + + + + + + + +Cutlass: cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Struct Template Reference
+
+
+ +

#include <tensor_ref_collection.h>

+
+Inheritance diagram for cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >:
+
+
+ + +cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > + +
+ + + + + +

+Classes

class  ConstIterator
 Constant iterator over tensors implied by TensorRefBatchStrided. More...
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Types

typedef TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > Base
 Underlying TensorRef type. More...
 
typedef Base::Storage Storage
 Storage type. More...
 
typedef Index_ Index
 Index type. More...
 
typedef LongIndex_ LongIndex
 Typically, strides in memory can be very large. More...
 
typedef Coord< kRank > TensorCoord
 Coordinate in logical tensor space. More...
 
typedef Base TensorRef
 Tensor reference implied by the TensorRefBatchStrided. More...
 
- Public Types inherited from cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
typedef Storage_ Storage
 Data type of individual access. More...
 
typedef MapFunc_ MapFunc
 Mapping function from logical coordinate to internal n-D array. More...
 
typedef Index_ Index
 Index type. More...
 
typedef LongIndex_ LongIndex
 Typically, strides in memory can be very large. More...
 
typedef Coord< kRank > TensorCoord
 Coordinate in logical tensor space. More...
 
typedef Coord< kStorageRank > StorageCoord
 Coordinate in storage n-D array. More...
 
typedef Coord< kStorageRank - 1 > StrideVector
 
typedef TensorRef< typename platform::remove_const< Storage >::type const, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ > ConstTensorRef
 Tensor reference to a constant value. More...
 
typedef TensorCoord Coord_t
 Coordinate in logical tensor space. More...
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE TensorRefBatchStrided ()
 
CUTLASS_HOST_DEVICE TensorRefBatchStrided (TensorRef const &ref, LongIndex _tensor_stride=0)
 
CUTLASS_HOST_DEVICE LongIndex get_pointer_offset (Index idx) const
 Gets the pointer offset. More...
 
CUTLASS_HOST_DEVICE TensorRef at (Index idx) const
 
CUTLASS_HOST_DEVICE ConstIterator begin ()
 Returns an iterator. More...
 
- Public Member Functions inherited from cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr=nullptr)
 Helper for 1-D memory. All higher ranks are projected onto the fastest changing rank. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, Index ldm)
 Helper to construct from a pointer and single stride element for 2-D pitch linear memory. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, StrideVector const &stride)
 Constructs from a single pointer and stride vector. More...
 
CUTLASS_HOST_DEVICE TensorRef (Storage *ptr, StorageCoord const &stride)
 
CUTLASS_HOST_DEVICE TensorRef (TensorRef< typename platform::remove_const< Storage >::type, kRank, MapFunc, kStorageRank, Index, LongIndex > const &ref)
 Enables conversion from TensorRef of non-const type. More...
 
CUTLASS_HOST_DEVICE ConstTensorRef const_ref () const
 Returns a reference to constant-valued tensor. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr=nullptr)
 Updates only the pointer. More...
 
CUTLASS_HOST_DEVICE void reset (Storage *ptr, StorageCoord const &stride)
 Updates the pointer, stride, and location within a TensorRef. More...
 
CUTLASS_HOST_DEVICE bool good () const
 Returns true if the TensorRef may be safely accessed. More...
 
CUTLASS_HOST_DEVICE Storage * data () const
 Returns the pointer to referenced data. More...
 
CUTLASS_HOST_DEVICE StorageCoord stride () const
 Returns the stride of the tensor. More...
 
CUTLASS_HOST_DEVICE Index stride (int dim) const
 Returns the stride of the tensor in the given dimension. More...
 
CUTLASS_HOST_DEVICE Index leading_dim (int idx=0) const
 Returns the maximum stride element as the 'leading dimension'. More...
 
CUTLASS_HOST_DEVICE StorageCoord map (TensorCoord const &coord) const
 Maps a logical coordinate to an n-D array in memory. More...
 
CUTLASS_HOST_DEVICE LongIndex offset (TensorCoord const &coord) const
 Computes the offset of an index from the origin of the tensor. More...
 
CUTLASS_HOST_DEVICE Storageat (TensorCoord const &coord) const
 Returns a reference to the element at a given Coord. More...
 
CUTLASS_HOST_DEVICE Storageat (LongIndex idx) const
 Returns a reference to the element at a given linear index. More...
 
CUTLASS_HOST_DEVICE Storageoperator[] (TensorCoord const &coord) const
 Returns a reference to the element at a given Coord. More...
 
CUTLASS_HOST_DEVICE Storageoperator[] (LongIndex idx) const
 Returns a reference to the element at a given linear index. More...
 
CUTLASS_HOST_DEVICE TensorRefadd_pointer_offset (LongIndex delta)
 Adds an offset to each pointer. More...
 
CUTLASS_HOST_DEVICE TensorRef operator+ (TensorCoord const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRefoperator+= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRef operator- (TensorCoord const &b) const
 Returns a TensorRef offset by a given amount. More...
 
CUTLASS_HOST_DEVICE TensorRefoperator-= (TensorCoord const &b)
 Returns a TensorRef offset by a given amount. More...
 
+ + + + +

+Public Attributes

LongIndex tensor_stride
 Stride between tensors. More...
 
+ + + + + + + + + + + +

+Additional Inherited Members

- Static Public Attributes inherited from cutlass::TensorRef< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >
static int const kRank = Rank_
 Logical rank of tensor index space. More...
 
static int const kStorageRank = StorageRank_
 Rank of internal storage. More...
 
static int const Rank = kRank
 Logical rank of tensor index space. More...
 
+

Detailed Description

+

template<typename Storage_, int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+struct cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >

+ +

This satisfies TensorRefCollection and stores a collection of TensorRef objects that have identical strides. TensorRef objects are separated by a linear stride.

+

Member Typedef Documentation

+ +

◆ Base

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef TensorRef<Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_> cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Base
+
+ +
+
+ +

◆ Index

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Index_ cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Index
+
+ +
+
+ +

◆ LongIndex

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef LongIndex_ cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::LongIndex
+
+ +
+
+ +

◆ Storage

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Base::Storage cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::Storage
+
+ +
+
+ +

◆ TensorCoord

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Coord<kRank> cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorCoord
+
+ +
+
+ +

◆ TensorRef

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
typedef Base cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRef
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ TensorRefBatchStrided() [1/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRefBatchStrided ()
+
+inline
+
+ +
+
+ +

◆ TensorRefBatchStrided() [2/2]

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::TensorRefBatchStrided (TensorRef const & ref,
LongIndex _tensor_stride = 0 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ at()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TensorRef cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::at (Index idx) const
+
+inline
+
+ +
+
+ +

◆ begin()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE ConstIterator cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::begin ()
+
+inline
+
+ +
+
+ +

◆ get_pointer_offset()

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE LongIndex cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::get_pointer_offset (Index idx) const
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ tensor_stride

+ +
+
+
+template<typename Storage_ , int Rank_, typename MapFunc_ = IdentityTensorMapFunc<Rank_>, int StorageRank_ = MapFunc_::kStorageRank, typename Index_ = int, typename LongIndex_ = long long>
+ + + + +
LongIndex cutlass::TensorRefBatchStrided< Storage_, Rank_, MapFunc_, StorageRank_, Index_, LongIndex_ >::tensor_stride
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TensorRefBatchStrided.png b/docs/structcutlass_1_1TensorRefBatchStrided.png new file mode 100644 index 0000000000..9a21bce92c Binary files /dev/null and b/docs/structcutlass_1_1TensorRefBatchStrided.png differ diff --git a/docs/structcutlass_1_1TensorRef_3_01Storage___00_01Rank___00_01MapFunc___00_011_00_01Index___00_01LongIndex___01_4_1_1StrideVector.html b/docs/structcutlass_1_1TensorRef_3_01Storage___00_01Rank___00_01MapFunc___00_011_00_01Index___00_01LongIndex___01_4_1_1StrideVector.html new file mode 100644 index 0000000000..76d7a9ec85 --- /dev/null +++ b/docs/structcutlass_1_1TensorRef_3_01Storage___00_01Rank___00_01MapFunc___00_011_00_01Index___00_01LongIndex___01_4_1_1StrideVector.html @@ -0,0 +1,97 @@ + + + + + + + +Cutlass: cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::StrideVector Struct Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::StrideVector Struct Reference
+
+
+ +

#include <tensor_ref.h>

+

Detailed Description

+

template<typename Storage_, int Rank_, typename MapFunc_, typename Index_, typename LongIndex_>
+struct cutlass::TensorRef< Storage_, Rank_, MapFunc_, 1, Index_, LongIndex_ >::StrideVector

+ +

Stride vector in storage coordinate space - assumes least significant stride is 1 and does not store it.

+

The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TileAllocation-members.html b/docs/structcutlass_1_1TileAllocation-members.html new file mode 100644 index 0000000000..11af96977b --- /dev/null +++ b/docs/structcutlass_1_1TileAllocation-members.html @@ -0,0 +1,101 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TileAllocation< Scalar_, Shape_ > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1TileAllocation.html b/docs/structcutlass_1_1TileAllocation.html new file mode 100644 index 0000000000..7f5ad2df03 --- /dev/null +++ b/docs/structcutlass_1_1TileAllocation.html @@ -0,0 +1,366 @@ + + + + + + + +Cutlass: cutlass::TileAllocation< Scalar_, Shape_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TileAllocation< Scalar_, Shape_ > Struct Template Reference
+
+
+ +

Class for storing a tile in memory and accessing it through a tensor ref. +

+ +

#include <tile_allocation.h>

+ + + + + + + + + + + + + + + + + + + + +

+Public Types

typedef Scalar_ Scalar
 Scalar element. More...
 
typedef StorageType< sizeof(Scalar)>::Type Storage
 The actual storage (may differ from the scalar type) More...
 
typedef Shape_ Shape
 Size of the allocation in units of scalars. More...
 
typedef ShapeStrides< Shape, 1 >::Shape Strides
 Strides. More...
 
typedef TensorRef< Scalar const, 4 > ConstTensorRef
 Defines the tensor reference for this allocation. More...
 
typedef TensorRef< Scalar, 4 > TensorRef
 Defines the tensor reference for this allocation. More...
 
+ + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_DEVICE Scalardata ()
 Returns a pointer to the raw data. More...
 
CUTLASS_DEVICE Scalar const * data () const
 Returns a const pointer to the raw data. More...
 
CUTLASS_DEVICE TensorRef reference ()
 Returns a TensorRef object pointing to the data. More...
 
CUTLASS_DEVICE ConstTensorRef reference () const
 Returns a TensorRef object pointing to the data. More...
 
+ + + + +

+Public Attributes

Storage storage [Shape::kD][Shape::kH][Shape::kW][Shape::kC]
 Storage. More...
 
+

Member Typedef Documentation

+ +

◆ ConstTensorRef

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + +
typedef TensorRef<Scalar const, 4> cutlass::TileAllocation< Scalar_, Shape_ >::ConstTensorRef
+
+ +
+
+ +

◆ Scalar

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + +
typedef Scalar_ cutlass::TileAllocation< Scalar_, Shape_ >::Scalar
+
+ +
+
+ +

◆ Shape

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + +
typedef Shape_ cutlass::TileAllocation< Scalar_, Shape_ >::Shape
+
+ +
+
+ +

◆ Storage

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + +
typedef StorageType<sizeof(Scalar)>::Type cutlass::TileAllocation< Scalar_, Shape_ >::Storage
+
+ +
+
+ +

◆ Strides

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + +
typedef ShapeStrides<Shape, 1>::Shape cutlass::TileAllocation< Scalar_, Shape_ >::Strides
+
+ +
+
+ +

◆ TensorRef

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + +
typedef TensorRef<Scalar, 4> cutlass::TileAllocation< Scalar_, Shape_ >::TensorRef
+
+ +
+
+

Member Function Documentation

+ +

◆ data() [1/2]

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE Scalar* cutlass::TileAllocation< Scalar_, Shape_ >::data ()
+
+inline
+
+ +
+
+ +

◆ data() [2/2]

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE Scalar const* cutlass::TileAllocation< Scalar_, Shape_ >::data () const
+
+inline
+
+ +
+
+ +

◆ reference() [1/2]

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE TensorRef cutlass::TileAllocation< Scalar_, Shape_ >::reference ()
+
+inline
+
+ +
+
+ +

◆ reference() [2/2]

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE ConstTensorRef cutlass::TileAllocation< Scalar_, Shape_ >::reference () const
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ storage

+ +
+
+
+template<typename Scalar_ , typename Shape_ >
+ + + + +
Storage cutlass::TileAllocation< Scalar_, Shape_ >::storage[Shape::kD][Shape::kH][Shape::kW][Shape::kC]
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TileCoord-members.html b/docs/structcutlass_1_1TileCoord-members.html new file mode 100644 index 0000000000..73a3fef585 --- /dev/null +++ b/docs/structcutlass_1_1TileCoord-members.html @@ -0,0 +1,151 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TileCoord< Index_ > Member List
+
+
+ +

This is the complete list of members for cutlass::TileCoord< Index_ >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
at()cutlass::Coord< 4, Index_ >inline
at(int dim)cutlass::Coord< 4, Index_ >inline
at() constcutlass::Coord< 4, Index_ >inline
at(int dim) constcutlass::Coord< 4, Index_ >inline
Base typedefcutlass::TileCoord< Index_ >
c() constcutlass::TileCoord< Index_ >inline
c()cutlass::TileCoord< Index_ >inline
clamp(Coord< kRank > const &max, Coord< kRank > const &min=Coord< kRank >())cutlass::Coord< 4, Index_ >inline
Coord(Index value=0)cutlass::Coord< 4, Index_ >inline
Coord(Index _idx[])cutlass::Coord< 4, Index_ >inline
Coord(Coord< kRank > const &coord)cutlass::Coord< 4, Index_ >inline
count() constcutlass::Coord< 4, Index_ >inline
d() constcutlass::TileCoord< Index_ >inline
d()cutlass::TileCoord< Index_ >inline
dhw() constcutlass::TileCoord< Index_ >inline
dot(Coord const &b, T sum) constcutlass::Coord< 4, Index_ >inline
dot(Coord const &b) constcutlass::Coord< 4, Index_ >inline
h() constcutlass::TileCoord< Index_ >inline
h()cutlass::TileCoord< Index_ >inline
hw() constcutlass::TileCoord< Index_ >inline
hwc() constcutlass::TileCoord< Index_ >inline
idxcutlass::Coord< 4, Index_ >
Index typedefcutlass::TileCoord< Index_ >
kCcutlass::TileCoord< Index_ >static
kDcutlass::TileCoord< Index_ >static
kHcutlass::TileCoord< Index_ >static
kRankcutlass::Coord< 4, Index_ >static
kWcutlass::TileCoord< Index_ >static
Ncutlass::Coord< 4, Index_ >static
operator bool() constcutlass::Coord< 4, Index_ >inline
operator!() constcutlass::Coord< 4, Index_ >inline
operator!=(Coord< kRank > const &b) constcutlass::Coord< 4, Index_ >inline
operator*(Base const &b) constcutlass::TileCoord< Index_ >inline
Coord< 4, Index_ >::operator*(Coord const &b) constcutlass::Coord< 4, Index_ >inline
operator*=(Base const &b)cutlass::TileCoord< Index_ >inline
Coord< 4, Index_ >::operator*=(Coord const &b)cutlass::Coord< 4, Index_ >inline
operator+(Base const &b) constcutlass::TileCoord< Index_ >inline
Coord< 4, Index_ >::operator+(Coord const &b) constcutlass::Coord< 4, Index_ >inline
operator+=(Base const &b)cutlass::TileCoord< Index_ >inline
Coord< 4, Index_ >::operator+=(Coord const &b)cutlass::Coord< 4, Index_ >inline
operator-(Base const &b) constcutlass::TileCoord< Index_ >inline
Coord< 4, Index_ >::operator-(Coord const &b) constcutlass::Coord< 4, Index_ >inline
operator-=(Base const &b)cutlass::TileCoord< Index_ >inline
Coord< 4, Index_ >::operator-=(Coord const &b)cutlass::Coord< 4, Index_ >inline
operator/(Base const &b) constcutlass::TileCoord< Index_ >inline
Coord< 4, Index_ >::operator/(Coord const &b) constcutlass::Coord< 4, Index_ >inline
operator/=(Base const &b)cutlass::TileCoord< Index_ >inline
Coord< 4, Index_ >::operator/=(Coord const &b)cutlass::Coord< 4, Index_ >inline
operator<(Coord< kRank > const &b) constcutlass::Coord< 4, Index_ >inline
operator<=(Coord< kRank > const &b) constcutlass::Coord< 4, Index_ >inline
operator==(Coord< kRank > const &b) constcutlass::Coord< 4, Index_ >inline
operator[](int dim)cutlass::Coord< 4, Index_ >inline
operator[](int dim) constcutlass::Coord< 4, Index_ >inline
slice(int start=0, Index identity=0) constcutlass::Coord< 4, Index_ >inline
TileCoord()cutlass::TileCoord< Index_ >inline
TileCoord(Coord< 3, Index > const &coord)cutlass::TileCoord< Index_ >inline
TileCoord(Coord< 4, Index > const &coord)cutlass::TileCoord< Index_ >inline
TileCoord(Index coord[4])cutlass::TileCoord< Index_ >inline
TileCoord(Index d, Index h, Index w, Index c)cutlass::TileCoord< Index_ >inline
w() constcutlass::TileCoord< Index_ >inline
w()cutlass::TileCoord< Index_ >inline
+ + + + diff --git a/docs/structcutlass_1_1TileCoord.html b/docs/structcutlass_1_1TileCoord.html new file mode 100644 index 0000000000..e0684be4de --- /dev/null +++ b/docs/structcutlass_1_1TileCoord.html @@ -0,0 +1,1127 @@ + + + + + + + +Cutlass: cutlass::TileCoord< Index_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TileCoord< Index_ > Struct Template Reference
+
+
+ +

#include <tile_coord.h>

+
+Inheritance diagram for cutlass::TileCoord< Index_ >:
+
+
+ + +cutlass::Coord< 4, Index_ > + +
+ + + + + + + + + + + + +

+Public Types

typedef Index_ Index
 Index type. More...
 
typedef Coord< 4, IndexBase
 Underlying Coord<4> More...
 
- Public Types inherited from cutlass::Coord< 4, Index_ >
typedef Index_ Index
 Index type used to store elements. More...
 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE TileCoord ()
 Default ctor. More...
 
CUTLASS_HOST_DEVICE TileCoord (Coord< 3, Index > const &coord)
 Constructs from Coord<3> and infers coord[kC] = 0. More...
 
CUTLASS_HOST_DEVICE TileCoord (Coord< 4, Index > const &coord)
 Constructs from Coord<4> More...
 
CUTLASS_HOST_DEVICE TileCoord (Index coord[4])
 Constructs from an array of coordinate elements. More...
 
CUTLASS_HOST_DEVICE TileCoord (Index d, Index h, Index w, Index c)
 Helper to construct from D, H, W, and C coordinates. More...
 
CUTLASS_HOST_DEVICE Index const & d () const
 Returns the D element of the coordinate. More...
 
CUTLASS_HOST_DEVICE Indexd ()
 Returns the D element of the coordinate. More...
 
CUTLASS_HOST_DEVICE Index const & h () const
 Returns the H element of the coordinate. More...
 
CUTLASS_HOST_DEVICE Indexh ()
 Returns the H element of the coordinate. More...
 
CUTLASS_HOST_DEVICE Index const & w () const
 Returns the W element of the coordinate. More...
 
CUTLASS_HOST_DEVICE Indexw ()
 Returns the W element of the coordinate. More...
 
CUTLASS_HOST_DEVICE Index const & c () const
 Returns the C element of the coordinate. More...
 
CUTLASS_HOST_DEVICE Indexc ()
 Returns the C element of the coordinate. More...
 
CUTLASS_HOST_DEVICE Coord< 2 > hw () const
 Gets H and W dimensions as a Coord<2> More...
 
CUTLASS_HOST_DEVICE Coord< 3 > hwc () const
 Gets H, W, and C dimensions as a Coord<3> More...
 
CUTLASS_HOST_DEVICE Coord< 3 > dhw () const
 Gets D, H, and W dimensions as a Coord<3> More...
 
CUTLASS_HOST_DEVICE TileCoord operator+ (Base const &b) const
 Element-wise addition. More...
 
CUTLASS_HOST_DEVICE TileCoord operator- (Base const &b) const
 Element-wise subtraction. More...
 
CUTLASS_HOST_DEVICE TileCoord operator* (Base const &b) const
 Element-wise multiplication. More...
 
CUTLASS_HOST_DEVICE TileCoord operator/ (Base const &b) const
 Element-wise division. More...
 
CUTLASS_HOST_DEVICE TileCoordoperator+= (Base const &b)
 In-place addition. More...
 
CUTLASS_HOST_DEVICE TileCoordoperator-= (Base const &b)
 In-place subtraction. More...
 
CUTLASS_HOST_DEVICE TileCoordoperator*= (Base const &b)
 In-place multiplication. More...
 
CUTLASS_HOST_DEVICE TileCoordoperator/= (Base const &b)
 In-place division. More...
 
- Public Member Functions inherited from cutlass::Coord< 4, Index_ >
CUTLASS_HOST_DEVICE Coord (Index value=0)
 Default ctor initializes uniformly. More...
 
CUTLASS_HOST_DEVICE Coord (Index _idx[])
 Constructs from an array of integers. More...
 
CUTLASS_HOST_DEVICE Coord (Coord< kRank > const &coord)
 Constructs from another Coord of the same rank. More...
 
CUTLASS_HOST_DEVICE Coord< Slice > slice (int start=0, Index identity=0) const
 
CUTLASS_HOST_DEVICE operator bool () const
 Returns true if Coord is non-zero. More...
 
CUTLASS_HOST_DEVICE bool operator! () const
 Returns true if Coord is uniformly zero. More...
 
CUTLASS_HOST_DEVICE Coord operator+ (Coord const &b) const
 Element-wise addition. More...
 
CUTLASS_HOST_DEVICE Coord operator- (Coord const &b) const
 Element-wise subtraction. More...
 
CUTLASS_HOST_DEVICE Coord operator* (Coord const &b) const
 Element-wise multiplication. More...
 
CUTLASS_HOST_DEVICE Coord operator/ (Coord const &b) const
 Element-wise division. More...
 
CUTLASS_HOST_DEVICE Coordoperator+= (Coord const &b)
 In-place addition. More...
 
CUTLASS_HOST_DEVICE Coordoperator-= (Coord const &b)
 In-place subtraction. More...
 
CUTLASS_HOST_DEVICE Coordoperator*= (Coord const &b)
 In-place multiplication. More...
 
CUTLASS_HOST_DEVICE Coordoperator/= (Coord const &b)
 In-place division. More...
 
CUTLASS_HOST_DEVICE Indexoperator[] (int dim)
 Member access operator. More...
 
CUTLASS_HOST_DEVICE Index const & operator[] (int dim) const
 Member access operator. More...
 
CUTLASS_HOST_DEVICEdot (Coord const &b, T sum) const
 Computes the dot product of two Coord instances. More...
 
CUTLASS_HOST_DEVICEdot (Coord const &b) const
 Computes the dot product of two Coord instances. More...
 
CUTLASS_HOST_DEVICE Indexat ()
 Gets the index of a given Coord element. More...
 
CUTLASS_HOST_DEVICE Indexat (int dim)
 Access via index; may limit unrolling potential. More...
 
CUTLASS_HOST_DEVICE Index const & at () const
 Gets the index of a given Coord element. More...
 
CUTLASS_HOST_DEVICE Index const & at (int dim) const
 Access via index; may limit unrolling potential. More...
 
CUTLASS_HOST_DEVICE bool operator== (Coord< kRank > const &b) const
 Determines if two Coord<> objects are equal. More...
 
CUTLASS_HOST_DEVICE bool operator!= (Coord< kRank > const &b) const
 Not equal. More...
 
CUTLASS_HOST_DEVICE Coordclamp (Coord< kRank > const &max, Coord< kRank > const &min=Coord< kRank >())
 Clamps a coordinate to a range specified by maximum and minimum values. More...
 
CUTLASS_HOST_DEVICE Index count () const
 Returns the product of all elements. More...
 
CUTLASS_HOST_DEVICE bool operator< (Coord< kRank > const &b) const
 Less than operator. More...
 
CUTLASS_HOST_DEVICE bool operator<= (Coord< kRank > const &b) const
 Less than or equals operator. More...
 
+ + + + + + + + + + + + + + + + + + + + +

+Static Public Attributes

static int kD = 0
 D dimension. More...
 
static int kH = 1
 H dimension. More...
 
static int kW = 2
 W dimension. More...
 
static int kC = 3
 C dimension. More...
 
- Static Public Attributes inherited from cutlass::Coord< 4, Index_ >
static int const kRank
 Number of elements in Coord. More...
 
static int const N
 Number of elements in Coord, aliased for compatibility. More...
 
+ + + + + +

+Additional Inherited Members

- Public Attributes inherited from cutlass::Coord< 4, Index_ >
Index idx [kRank]
 Indices. More...
 
+

Detailed Description

+

template<typename Index_ = int>
+struct cutlass::TileCoord< Index_ >

+ +

TileCoord wraps Coord<4, int> to provide a helper for accessing named dimensions. Classes expecting a coordinate in the rank=4 index space of a CUTLASS tile structure should use TileCoord.

+

Member Typedef Documentation

+ +

◆ Base

+ +
+
+
+template<typename Index_ = int>
+ + + + +
typedef Coord<4, Index> cutlass::TileCoord< Index_ >::Base
+
+ +
+
+ +

◆ Index

+ +
+
+
+template<typename Index_ = int>
+ + + + +
typedef Index_ cutlass::TileCoord< Index_ >::Index
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ TileCoord() [1/5]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileCoord< Index_ >::TileCoord ()
+
+inline
+
+ +
+
+ +

◆ TileCoord() [2/5]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileCoord< Index_ >::TileCoord (Coord< 3, Index > const & coord)
+
+inline
+
+ +
+
+ +

◆ TileCoord() [3/5]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileCoord< Index_ >::TileCoord (Coord< 4, Index > const & coord)
+
+inline
+
+ +
+
+ +

◆ TileCoord() [4/5]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileCoord< Index_ >::TileCoord (Index coord[4])
+
+inline
+
+ +
+
+ +

◆ TileCoord() [5/5]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileCoord< Index_ >::TileCoord (Index d,
Index h,
Index w,
Index c 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ c() [1/2]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Index const& cutlass::TileCoord< Index_ >::c () const
+
+inline
+
+ +
+
+ +

◆ c() [2/2]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Index& cutlass::TileCoord< Index_ >::c ()
+
+inline
+
+ +
+
+ +

◆ d() [1/2]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Index const& cutlass::TileCoord< Index_ >::d () const
+
+inline
+
+ +
+
+ +

◆ d() [2/2]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Index& cutlass::TileCoord< Index_ >::d ()
+
+inline
+
+ +
+
+ +

◆ dhw()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Coord<3> cutlass::TileCoord< Index_ >::dhw () const
+
+inline
+
+ +
+
+ +

◆ h() [1/2]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Index const& cutlass::TileCoord< Index_ >::h () const
+
+inline
+
+ +
+
+ +

◆ h() [2/2]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Index& cutlass::TileCoord< Index_ >::h ()
+
+inline
+
+ +
+
+ +

◆ hw()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Coord<2> cutlass::TileCoord< Index_ >::hw () const
+
+inline
+
+ +
+
+ +

◆ hwc()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Coord<3> cutlass::TileCoord< Index_ >::hwc () const
+
+inline
+
+ +
+
+ +

◆ operator*()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileCoord cutlass::TileCoord< Index_ >::operator* (Base const & b) const
+
+inline
+
+ +
+
+ +

◆ operator*=()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileCoord& cutlass::TileCoord< Index_ >::operator*= (Base const & b)
+
+inline
+
+ +
+
+ +

◆ operator+()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileCoord cutlass::TileCoord< Index_ >::operator+ (Base const & b) const
+
+inline
+
+ +
+
+ +

◆ operator+=()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileCoord& cutlass::TileCoord< Index_ >::operator+= (Base const & b)
+
+inline
+
+ +
+
+ +

◆ operator-()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileCoord cutlass::TileCoord< Index_ >::operator- (Base const & b) const
+
+inline
+
+ +
+
+ +

◆ operator-=()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileCoord& cutlass::TileCoord< Index_ >::operator-= (Base const & b)
+
+inline
+
+ +
+
+ +

◆ operator/()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileCoord cutlass::TileCoord< Index_ >::operator/ (Base const & b) const
+
+inline
+
+ +
+
+ +

◆ operator/=()

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileCoord& cutlass::TileCoord< Index_ >::operator/= (Base const & b)
+
+inline
+
+ +
+
+ +

◆ w() [1/2]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Index const& cutlass::TileCoord< Index_ >::w () const
+
+inline
+
+ +
+
+ +

◆ w() [2/2]

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE Index& cutlass::TileCoord< Index_ >::w ()
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ kC

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + +
int cutlass::TileCoord< Index_ >::kC = 3
+
+static
+
+ +
+
+ +

◆ kD

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + +
int cutlass::TileCoord< Index_ >::kD = 0
+
+static
+
+ +
+
+ +

◆ kH

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + +
int cutlass::TileCoord< Index_ >::kH = 1
+
+static
+
+ +
+
+ +

◆ kW

+ +
+
+
+template<typename Index_ = int>
+ + + + + +
+ + + + +
int cutlass::TileCoord< Index_ >::kW = 2
+
+static
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TileCoord.png b/docs/structcutlass_1_1TileCoord.png new file mode 100644 index 0000000000..ed79e4b075 Binary files /dev/null and b/docs/structcutlass_1_1TileCoord.png differ diff --git a/docs/structcutlass_1_1TileIteratorBase-members.html b/docs/structcutlass_1_1TileIteratorBase-members.html index a313a5115b..ce46e77f75 100644 --- a/docs/structcutlass_1_1TileIteratorBase-members.html +++ b/docs/structcutlass_1_1TileIteratorBase-members.html @@ -73,40 +73,40 @@
-
cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Member List
+
cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Member List
-

This is the complete list of members for cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >, including all inherited members.

+

This is the complete list of members for cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >, including all inherited members.

- - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + +
AccessType typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Delta typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Fragment typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentConstIterator typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentElement typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentIterator typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentShape typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
ImmediateOffsetStrides typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Index typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
initialize_predicates(PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &offset=make_Coord(0, 0, 0))cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inlinestatic
Iterations typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
kAccessSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kAdvancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kFragmentSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kIteratorFragmentcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kMemorySpacecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
PredicateVector typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Scalar typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Skew typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Storage typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
ThreadOffset typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Tile typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Traits typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
valid(int d, int h, int w, int c) constcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
AccessType typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Delta typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Fragment typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentConstIterator typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentElement typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentIterator typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentShape typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
ImmediateOffsetStrides typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Index typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
initialize_predicates(PredicateIterator predicate_it, PredicateFunctor const &predicate_func, Coord< 3 > const &offset)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inlinestatic
Iterations typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
kAccessSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kAdvancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kFragmentElementTypecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kFragmentSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kMemorySpacecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
PredicateVector typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Scalar typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Skew typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Storage typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
ThreadOffset typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Tile typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Traits typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
valid(int d, int h, int w, int c) constcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
diff --git a/docs/structcutlass_1_1TileIteratorBase.html b/docs/structcutlass_1_1TileIteratorBase.html index a946914977..b01915206f 100644 --- a/docs/structcutlass_1_1TileIteratorBase.html +++ b/docs/structcutlass_1_1TileIteratorBase.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Struct Template Reference +Cutlass: cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Struct Template Reference @@ -80,7 +80,7 @@ Static Public Attributes | List of all members
-
cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Struct Template Reference
+
cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Struct Template Reference
@@ -89,13 +89,13 @@

#include <tile_iterator.h>

-Inheritance diagram for cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >:
+Inheritance diagram for cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >:
- - -cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > -cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > + + +cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > +cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
@@ -107,356 +107,356 @@
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Types

typedef Traits_ Traits
 concept TileTraits More...
 
typedef Scalar_ Scalar
 Scalar element. More...
 
typedef FragmentElement_ FragmentElement
 Fragment element. More...
 
typedef Index_ Index
 Index type. More...
 
typedef Skew_ Skew
 Skew quantity. More...
 
typedef Traits::Tile Tile
 Tile shape. More...
 
typedef Traits::Delta Delta
 Distance along each dimension. More...
 
typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
 The strides in each dimension between different loads/stores. More...
 
typedef Traits::Iterations Iterations
 Iterations. More...
 
typedef Traits::ThreadOffset ThreadOffset
 Thread offset. More...
 
typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
 The elements loaded/store by one instruction. More...
 
typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
 The storage. More...
 
typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
 The fragment. More...
 
typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
 The fragment iterator. More...
 
typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
 The fragment const iterator. More...
 
typedef FragmentIterator::FragmentShape FragmentShape
 The shape of the fragment. More...
 
typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
 Default predicate mask type. More...
 
typedef Traits_ Traits
 concept TileTraits More...
 
typedef Scalar_ Scalar
 Scalar element. More...
 
typedef FragmentElement_ FragmentElement
 Fragment element. More...
 
typedef Index_ Index
 Index type. More...
 
typedef Skew_ Skew
 Skew quantity. More...
 
typedef Traits::Tile Tile
 Tile shape. More...
 
typedef Traits::Delta Delta
 Distance along each dimension. More...
 
typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
 The strides in each dimension between different loads/stores. More...
 
typedef Traits::Iterations Iterations
 Iterations. More...
 
typedef Traits::ThreadOffset ThreadOffset
 Thread offset. More...
 
typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
 The elements loaded/store by one instruction. More...
 
typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
 The storage. More...
 
typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
 The fragment. More...
 
typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
 The fragment iterator. More...
 
typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
 The fragment const iterator. More...
 
typedef FragmentIterator::FragmentShape FragmentShape
 The shape of the fragment. More...
 
typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
 Default predicate mask type. More...
 
- - - + + +

Public Member Functions

CUTLASS_DEVICE bool valid (int d, int h, int w, int c) const
 Is the iterator valid? More...
 
CUTLASS_HOST_DEVICE bool valid (int d, int h, int w, int c) const
 Is the iterator valid? More...
 
- - - - + + + +

Static Public Member Functions

template<typename PredicateIterator >
static CUTLASS_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &offset=make_Coord(0, 0, 0))
 Initializes a predicate vector. More...
 
template<typename PredicateIterator , typename PredicateFunctor >
static CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, PredicateFunctor const &predicate_func, Coord< 3 > const &offset)
 Initializes a predicate vector. More...
 
- - - - - - - - - - - - - - - + + + + + + + + + + + + + + +

Static Public Attributes

static IteratorAdvance::Kind const kAdvance = Advance_
 Specifies dimension in which post-increment accesses advance. More...
 
static IteratorFragment::Kind const kIteratorFragment = IteratorFragment_
 Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = MemorySpace
 Source or destination memory space. More...
 
static int const kAccessSize = Tile::kC
 The number of scalars accessed per load/store. More...
 
static int const kFragmentSize
 The size of storage needed per fragment. More...
 
static IteratorAdvance::Kind const kAdvance = Advance_
 Specifies dimension in which post-increment accesses advance. More...
 
static FragmentElementType::Kind const kFragmentElementType = FragmentElementType_
 Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = MemorySpace
 Source or destination memory space. More...
 
static int const kAccessSize = Traits::kAccessSize
 The number of scalars accessed per load/store. More...
 
static int const kFragmentSize
 The size of storage needed per fragment. More...
 

Member Typedef Documentation

- -

◆ AccessType

+ +

◆ AccessType

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Vectorize<FragmentElement, kAccessSize>::Type cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::AccessTypetypedef Vectorize<FragmentElement, kAccessSize>::Type cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::AccessType
- -

◆ Delta

+ +

◆ Delta

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Traits::Delta cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Deltatypedef Traits::Delta cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Delta
- -

◆ Fragment

+ +

◆ Fragment

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Fragment<FragmentElement, ShapeCount<Iterations>::kCount * kAccessSize> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Fragmenttypedef Fragment<FragmentElement, ShapeCount<Iterations>::kCount * kAccessSize> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Fragment
- -

◆ FragmentConstIterator

+ +

◆ FragmentConstIterator

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef FragmentConstIterator<Fragment, Iterations, AccessType> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentConstIteratortypedef FragmentConstIterator<Fragment, Iterations, AccessType> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentConstIterator
- -

◆ FragmentElement

+ +

◆ FragmentElement

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef FragmentElement_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentElementtypedef FragmentElement_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentElement
- -

◆ FragmentIterator

+ +

◆ FragmentIterator

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef FragmentIterator<Fragment, Iterations, AccessType> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentIteratortypedef FragmentIterator<Fragment, Iterations, AccessType> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentIterator
- -

◆ FragmentShape

+ +

◆ FragmentShape

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef FragmentIterator::FragmentShape cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentShapetypedef FragmentIterator::FragmentShape cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentShape
- -

◆ ImmediateOffsetStrides

+ +

◆ ImmediateOffsetStrides

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Traits::ImmediateOffsetStrides cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::ImmediateOffsetStridestypedef Traits::ImmediateOffsetStrides cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::ImmediateOffsetStrides
- -

◆ Index

+ +

◆ Index

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Index_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Indextypedef Index_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Index
- -

◆ Iterations

+ +

◆ Iterations

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Traits::Iterations cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Iterationstypedef Traits::Iterations cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Iterations
- -

◆ PredicateVector

+ +

◆ PredicateVector

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef PredicateVector<ShapeCount<Iterations>::kCount> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::PredicateVectortypedef PredicateVector<ShapeCount<Iterations>::kCount> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::PredicateVector
- -

◆ Scalar

+ +

◆ Scalar

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Scalar_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Scalartypedef Scalar_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Scalar
- -

◆ Skew

+ +

◆ Skew

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Skew_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Skewtypedef Skew_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Skew
- -

◆ Storage

+ +

◆ Storage

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Fragment<Scalar, ShapeCount<Tile>::kCount, kFragmentSize> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Storagetypedef Fragment<Scalar, ShapeCount<Tile>::kCount, kFragmentSize> cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Storage
- -

◆ ThreadOffset

+ +

◆ ThreadOffset

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Traits::ThreadOffset cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::ThreadOffsettypedef Traits::ThreadOffset cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::ThreadOffset
- -

◆ Tile

+ +

◆ Tile

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Traits::Tile cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Tiletypedef Traits::Tile cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Tile
- -

◆ Traits

+ +

◆ Traits

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Traits_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Traitstypedef Traits_ cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Traits
@@ -464,21 +464,21 @@

Member Function Documentation

- -

◆ initialize_predicates()

+ +

◆ initialize_predicates()

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
-template<typename PredicateIterator >
+template<typename PredicateIterator , typename PredicateFunctor >
- +
- + @@ -486,14 +486,14 @@

- - + + - + @@ -510,19 +510,19 @@

-

◆ valid()

+ +

◆ valid()

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
static CUTLASS_DEVICE void cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::initialize_predicates static CUTLASS_HOST_DEVICE void cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::initialize_predicates ( PredicateIterator  predicate_it, Coord< 3 > const & bounds, PredicateFunctor const & predicate_func,
Coord< 3 > const & offset = make_Coord(0, 0, 0) offset 
- +
- + @@ -561,19 +561,19 @@

Member Data Documentation

- -

◆ kAccessSize

+ +

◆ kAccessSize

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
CUTLASS_DEVICE bool cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::valid CUTLASS_HOST_DEVICE bool cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::valid ( int  d,
@@ -585,19 +585,19 @@

-

◆ kAdvance

+ +

◆ kAdvance

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
int const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kAccessSize = Tile::kCint const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kAccessSize = Traits::kAccessSize
@@ -609,19 +609,19 @@

-

◆ kFragmentSize

+ +

◆ kFragmentElementType

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
IteratorAdvance::Kind const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kAdvance = Advance_IteratorAdvance::Kind const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kAdvance = Advance_
@@ -630,22 +630,22 @@

-Initial value: + - -

◆ kIteratorFragment

+ +

◆ kFragmentSize

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
int const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kFragmentSizeFragmentElementType::Kind const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kFragmentElementType = FragmentElementType_
@@ -654,22 +654,22 @@

- +Initial value: - -

◆ kMemorySpace

+ +

◆ kMemorySpace

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
IteratorFragment::Kind const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kIteratorFragment = IteratorFragment_int const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kFragmentSize
@@ -687,7 +687,7 @@

diff --git a/docs/structcutlass_1_1TileIteratorBase.png b/docs/structcutlass_1_1TileIteratorBase.png index ce0eacc9de..0dd3418494 100644 Binary files a/docs/structcutlass_1_1TileIteratorBase.png and b/docs/structcutlass_1_1TileIteratorBase.png differ diff --git a/docs/structcutlass_1_1TileIteratorBase_1_1Params-members.html b/docs/structcutlass_1_1TileIteratorBase_1_1Params-members.html index 3acf4206bc..c3855bd167 100644 --- a/docs/structcutlass_1_1TileIteratorBase_1_1Params-members.html +++ b/docs/structcutlass_1_1TileIteratorBase_1_1Params-members.html @@ -73,26 +73,30 @@
-
cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params Member List
+
cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params Member List

- +
MemorySpace::Kind const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kMemorySpace = MemorySpaceMemorySpace::Kind const cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kMemorySpace = MemorySpace
- - - - - - - - - - + + + + + + + + + + + + + +
inc_advancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
initialize(Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
initialize(Index _stride_d, Index _stride_h, Index _stride_w)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
initialize()cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
stride_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
stride_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
stride_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_advancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
inc_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
inc_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
inc_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
initialize(long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize(Coord< 4 > const &stride)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize(long long _stride_d, Index _stride_h, Index _stride_w)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize()cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params()cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(Coord< 4 > const &stride)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
stride_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
stride_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
stride_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
diff --git a/docs/structcutlass_1_1TileIteratorBase_1_1Params.html b/docs/structcutlass_1_1TileIteratorBase_1_1Params.html index be921381e3..19a16384de 100644 --- a/docs/structcutlass_1_1TileIteratorBase_1_1Params.html +++ b/docs/structcutlass_1_1TileIteratorBase_1_1Params.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params Struct Reference +Cutlass: cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params Struct Reference @@ -77,7 +77,7 @@ Public Attributes | List of all members
-
cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params Struct Reference
+
cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params Struct Reference
@@ -86,96 +86,234 @@

#include <tile_iterator.h>

-Inheritance diagram for cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params:
+Inheritance diagram for cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params:
- - -cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params -cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params -cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Params + + +cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params +cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params +cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Params
- - - - - - - + + + + + + + + + + + + + + + + + + + + +

Public Member Functions

CUTLASS_HOST_DEVICE int initialize (Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize (Index _stride_d, Index _stride_h, Index _stride_w)
 
CUTLASS_HOST_DEVICE int initialize ()
 
CUTLASS_HOST_DEVICE Params ()
 Constructs params. More...
 
CUTLASS_HOST_DEVICE Params (long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)
 Constructs params. More...
 
CUTLASS_HOST_DEVICE Params (Coord< 4 > const &stride)
 Constructs params with a stride vector. More...
 
CUTLASS_HOST_DEVICE int initialize (long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize (Coord< 4 > const &stride)
 Initializes the parameters object from a vector of strides. More...
 
CUTLASS_HOST_DEVICE int initialize (long long _stride_d, Index _stride_h, Index _stride_w)
 Initializes the parameters object from a vector of strides. More...
 
CUTLASS_HOST_DEVICE int initialize ()
 Gotta have this. More...
 
- - - - - - - - - - - - - - + + + + + + + + + + + + + +

Public Attributes

Index stride_d
 
Index stride_h
 
Index stride_w
 
Index inc_d
 
Index inc_h
 
Index inc_w
 
Index inc_advance
 
long long stride_d
 
Index stride_h
 
Index stride_w
 
long long inc_d
 
Index inc_h
 
Index inc_w
 
long long inc_advance
 
+

Constructor & Destructor Documentation

+ +

◆ Params() [1/3]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params ()
+
+inline
+
+ +
+
+ +

◆ Params() [2/3]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (long long _stride_d,
Index _stride_h,
Index _stride_w,
long long _inc_d,
Index _inc_h,
Index _inc_w,
long long _inc_advance 
)
+
+inline
+
+ +
+
+ +

◆ Params() [3/3]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (Coord< 4 > const & stride)
+
+inline
+
+ +
+

Member Function Documentation

- -

◆ initialize() [1/3]

+ +

◆ initialize() [1/4]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
- + - + - + - + - + - + - + - + @@ -193,33 +331,61 @@

-

◆ initialize() [2/3]

+ +

◆ initialize() [2/4]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+

CUTLASS_HOST_DEVICE int cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (Index long long  _stride_d,
Index Index  _stride_h,
Index Index  _stride_w,
Index long long  _inc_d,
Index Index  _inc_h,
Index Index  _inc_w,
Index long long  _inc_advance 
+ + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE int cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (Coord< 4 > const & stride)
+
+inline
+
+ +
+ + +

◆ initialize() [3/4]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- + + +
- + - + - + - + @@ -237,19 +403,19 @@

-

◆ initialize() [3/3]

+ +

◆ initialize() [4/4]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE int cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (Index long long  _stride_d,
Index Index  _stride_h,
Index Index  _stride_w 
- +
- + @@ -265,112 +431,112 @@

Member Data Documentation

- -

◆ inc_advance

+ +

◆ inc_advance

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
CUTLASS_HOST_DEVICE int cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize ( )
- +
Index cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::inc_advancelong long cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::inc_advance
- -

◆ inc_d

+ +

◆ inc_d

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
Index cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::inc_dlong long cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::inc_d
- -

◆ inc_h

+ +

◆ inc_h

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
Index cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::inc_hIndex cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::inc_h
- -

◆ inc_w

+ +

◆ inc_w

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
Index cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::inc_wIndex cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::inc_w
- -

◆ stride_d

+ +

◆ stride_d

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
Index cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::stride_dlong long cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::stride_d
- -

◆ stride_h

+ +

◆ stride_h

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
Index cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::stride_hIndex cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::stride_h
- -

◆ stride_w

+ +

◆ stride_w

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
Index cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::stride_wIndex cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::stride_w
@@ -383,7 +549,7 @@

diff --git a/docs/structcutlass_1_1TileIteratorBase_1_1Params.png b/docs/structcutlass_1_1TileIteratorBase_1_1Params.png index f1c874633c..2b723102b3 100644 Binary files a/docs/structcutlass_1_1TileIteratorBase_1_1Params.png and b/docs/structcutlass_1_1TileIteratorBase_1_1Params.png differ diff --git a/docs/structcutlass_1_1TileLoadIterator-members.html b/docs/structcutlass_1_1TileLoadIterator-members.html index 6acaea33b6..7d9755f9f2 100644 --- a/docs/structcutlass_1_1TileLoadIterator-members.html +++ b/docs/structcutlass_1_1TileLoadIterator-members.html @@ -73,61 +73,67 @@

-
cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Member List
+
cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Member List
-

This is the complete list of members for cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >, including all inherited members.

+

This is the complete list of members for cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >, including all inherited members.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AccessType typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Base typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
BaseParams typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
data() constcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
Delta typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Fragment typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentConstIterator typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentElement typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentIterator typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentShape typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
ImmediateOffsetStrides typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
inc_advance()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
inc_d()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
inc_h()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
inc_stage()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
inc_w()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
Index typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
initialize_predicates(PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
Iterations typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
kAccessSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kAdvancecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kFragmentSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kIteratorFragmentcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kMemorySpacecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kRequiresLoadFence enum valuecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
load(Fragment &fragment, PredicateIterator pred_it) constcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
load(Fragment &fragment) constcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
load_post_increment(Fragment &fragment, PredicateIterator pred_it)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
load_post_increment(Fragment &fragment)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
paramscutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Pointer typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
PredicateVector typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Scalar typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
SharedStorage typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Skew typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
stagecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Storage typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
thread_offsetcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
ThreadOffset typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Tile typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
TileLoadIterator()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
TileLoadIterator(Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
TileLoadIterator(Params const &, SharedStorage &shared_storage, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
Traits typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
valid(int d, int h, int w, int c) constcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
AccessType typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
add_pointer_offset(Index offset)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
Base typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
BaseParams typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Delta typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Fragment typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentConstIterator typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentElement typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentIterator typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentShape typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
ImmediateOffsetStrides typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
inc_advance()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
inc_d()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
inc_h()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
inc_stage()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
inc_w()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
Index typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
initialize_predicates(PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
initialize_predicates(PredicateIterator predicate_it, PredicateFunctor const &functor, Coord< 3 > const &block_offset)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
Iterations typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
kAccessSizecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kAdvancecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kFragmentElementTypecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kFragmentSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kMemorySpacecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kRequiresLoadFence enum valuecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
load(Fragment &fragment, PredicateIterator pred_it) constcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load(Fragment &fragment) constcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load(Fragment &fragment, int d)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load_element(AccessType &value, int d, int h, int w, int c) constcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load_post_increment(Fragment &fragment, PredicateIterator pred_it)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load_post_increment(Fragment &fragment)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
operator+=(Coord< 3 > const &offset)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
paramscutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Pointer typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
PredicateVector typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Scalar typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
SharedStorage typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Skew typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
stagecutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Storage typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
stride_advance(void)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
TensorRef typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
thread_offsetcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
ThreadOffset typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Tile typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
TileLoadIterator()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
TileLoadIterator(Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
TileLoadIterator(Params const &, Scalar const *ptr, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
Traits typedefcutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
valid(int d, int h, int w, int c) constcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
diff --git a/docs/structcutlass_1_1TileLoadIterator.html b/docs/structcutlass_1_1TileLoadIterator.html index d670b93fbb..f9e5ad7e57 100644 --- a/docs/structcutlass_1_1TileLoadIterator.html +++ b/docs/structcutlass_1_1TileLoadIterator.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Struct Template Reference +Cutlass: cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Struct Template Reference @@ -80,7 +80,7 @@ Static Public Attributes | List of all members
-
cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Struct Template Reference
+
cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Struct Template Reference
@@ -89,12 +89,12 @@

#include <tile_iterator.h>

-Inheritance diagram for cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >:
+Inheritance diagram for cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >:
- - -cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > + + +cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
@@ -106,521 +106,559 @@
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Types

enum  { kRequiresLoadFence = Tile::kD == 1 +
enum  { kRequiresLoadFence = Tile::kD == 1 }
 Do we require a fence? More...
 
typedef TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Base
 Base class. More...
 
typedef Base::Traits Traits
 concept TileTraits More...
 
typedef Base::Scalar Scalar
 Scalar element. More...
 
typedef Base::FragmentElement FragmentElement
 Fragment element. More...
 
typedef Base::Index Index
 Index type. More...
 
typedef Base::Skew Skew
 Skew quantity. More...
 
typedef Base::Tile Tile
 Tile shape. More...
 
typedef Base::Delta Delta
 Delta. More...
 
typedef Base::Iterations Iterations
 Iterations. More...
 
typedef Base::ThreadOffset ThreadOffset
 ThreadOffset functor. More...
 
typedef Base::FragmentShape FragmentShape
 Fragment type. More...
 
typedef Base::AccessType AccessType
 Memory access type. More...
 
typedef Base::Fragment Fragment
 Fragment definition. More...
 
typedef Base::FragmentIterator FragmentIterator
 Fragment iterator definition. More...
 
typedef Base::FragmentConstIterator FragmentConstIterator
 Fragment const iterator definition. More...
 
typedef Base::PredicateVector PredicateVector
 Default predicate mask type. More...
 
typedef Base::Storage SharedStorage
 Storage object that may be loaded from. More...
 
typedef Base::Params BaseParams
 IteratorBase parameters. More...
 
typedef Scalar const * Pointer
 The pointer type. More...
 
- Public Types inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
typedef Traits_ Traits
 concept TileTraits More...
 
typedef Scalar_ Scalar
 Scalar element. More...
 
typedef FragmentElement_ FragmentElement
 Fragment element. More...
 
typedef Index_ Index
 Index type. More...
 
typedef Skew_ Skew
 Skew quantity. More...
 
typedef Traits::Tile Tile
 Tile shape. More...
 
typedef Traits::Delta Delta
 Distance along each dimension. More...
 
typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
 The strides in each dimension between different loads/stores. More...
 
typedef Traits::Iterations Iterations
 Iterations. More...
 
typedef Traits::ThreadOffset ThreadOffset
 Thread offset. More...
 
typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
 The elements loaded/store by one instruction. More...
 
typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
 The storage. More...
 
typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
 The fragment. More...
 
typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
 The fragment iterator. More...
 
typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
 The fragment const iterator. More...
 
typedef FragmentIterator::FragmentShape FragmentShape
 The shape of the fragment. More...
 
typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
 Default predicate mask type. More...
 
typedef TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Base
 Base class. More...
 
typedef Base::Traits Traits
 concept TileTraits More...
 
typedef Base::Scalar Scalar
 Scalar element. More...
 
typedef FragmentElement_ FragmentElement
 Fragment element. More...
 
typedef Base::Index Index
 Index type. More...
 
typedef Base::Skew Skew
 Skew quantity. More...
 
typedef Base::Tile Tile
 Tile shape. More...
 
typedef Base::Delta Delta
 Delta. More...
 
typedef Base::Iterations Iterations
 Iterations. More...
 
typedef Base::ThreadOffset ThreadOffset
 ThreadOffset functor. More...
 
typedef Base::FragmentShape FragmentShape
 Fragment type. More...
 
typedef Base::AccessType AccessType
 Memory access type. More...
 
typedef Base::Fragment Fragment
 Fragment definition. More...
 
typedef Base::FragmentIterator FragmentIterator
 Fragment iterator definition. More...
 
typedef Base::FragmentConstIterator FragmentConstIterator
 Fragment const iterator definition. More...
 
typedef Base::PredicateVector PredicateVector
 Default predicate mask type. More...
 
typedef Base::Storage SharedStorage
 Storage object that may be loaded from. More...
 
typedef Base::Params BaseParams
 IteratorBase parameters. More...
 
typedef Scalar const * Pointer
 The pointer type. More...
 
typedef TensorRef< Scalar const, 4 > TensorRef
 Tensor reference for the load iterator. More...
 
- Public Types inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
typedef Traits_ Traits
 concept TileTraits More...
 
typedef Scalar_ Scalar
 Scalar element. More...
 
typedef FragmentElement_ FragmentElement
 Fragment element. More...
 
typedef Index_ Index
 Index type. More...
 
typedef Skew_ Skew
 Skew quantity. More...
 
typedef Traits::Tile Tile
 Tile shape. More...
 
typedef Traits::Delta Delta
 Distance along each dimension. More...
 
typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
 The strides in each dimension between different loads/stores. More...
 
typedef Traits::Iterations Iterations
 Iterations. More...
 
typedef Traits::ThreadOffset ThreadOffset
 Thread offset. More...
 
typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
 The elements loaded/stored by one instruction. More...
 
typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
 The storage. More...
 
typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
 The fragment. More...
 
typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
 The fragment iterator. More...
 
typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
 The fragment const iterator. More...
 
typedef FragmentIterator::FragmentShape FragmentShape
 The shape of the fragment. More...
 
typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
 Default predicate mask type. More...
 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Member Functions

template<typename PredicateIterator >
CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))
 Initializes a predicate vector. More...
 
CUTLASS_HOST_DEVICE TileLoadIterator ()
 Default constructor. More...
 
CUTLASS_HOST_DEVICE TileLoadIterator (Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
 Constructs a tile load iterator. More...
 
CUTLASS_HOST_DEVICE TileLoadIterator (Params const &, SharedStorage &shared_storage, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
 Constructs a tile load iterator. More...
 
CUTLASS_HOST_DEVICE Scalar const * data () const
 Returns the current pointer. More...
 
CUTLASS_HOST_DEVICE void inc_d ()
 Increment in the D dimension. More...
 
CUTLASS_HOST_DEVICE void inc_h ()
 Increment in the H dimension. More...
 
CUTLASS_HOST_DEVICE void inc_w ()
 Increment in the W dimension. More...
 
CUTLASS_HOST_DEVICE void inc_advance ()
 Increment in the next dimension. More...
 
CUTLASS_DEVICE void inc_stage ()
 Increment the stage. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment, PredicateIterator pred_it)
 Loads a fragment and advances the iterator to the next tile. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment)
 Loads a fragment and advances the iterator to the next tile. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void load (Fragment &fragment, PredicateIterator pred_it) const
 Loads a fragment without advancing the iterator.. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void load (Fragment &fragment) const
 Loads a fragment without advancing the iterator.. More...
 
- Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
CUTLASS_DEVICE bool valid (int d, int h, int w, int c) const
 Is the iterator valid? More...
 
template<typename PredicateIterator >
CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))
 Initializes a predicate vector using a RegularTilePredicateFunctor. More...
 
template<typename PredicateIterator , typename PredicateFunctor >
CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, PredicateFunctor const &functor, Coord< 3 > const &block_offset)
 Initializes a predicate vector using an arbitrary predicate functor. More...
 
CUTLASS_HOST_DEVICE TileLoadIterator ()
 Default constructor. More...
 
CUTLASS_HOST_DEVICE TileLoadIterator (Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
 Constructs a tile load iterator. More...
 
CUTLASS_HOST_DEVICE TileLoadIterator (Params const &, Scalar const *ptr, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
 Constructs a tile load iterator. More...
 
CUTLASS_HOST_DEVICE void inc_d ()
 Increment in the D dimension. More...
 
CUTLASS_HOST_DEVICE void inc_h ()
 Increment in the H dimension. More...
 
CUTLASS_HOST_DEVICE void inc_w ()
 Increment in the W dimension. More...
 
CUTLASS_HOST_DEVICE void inc_advance ()
 Increment in the next dimension. More...
 
CUTLASS_HOST_DEVICE void load_element (AccessType &value, int d, int h, int w, int c) const
 Loads a single fragment element from memory. More...
 
CUTLASS_HOST_DEVICE void inc_stage ()
 Increment the stage. More...
 
CUTLASS_HOST_DEVICE TileLoadIteratoroperator+= (Coord< 3 > const &offset)
 Adds a vector offset to the iterator. More...
 
CUTLASS_HOST_DEVICE void add_pointer_offset (Index offset)
 Adds a raw offset to the pointer. More...
 
CUTLASS_HOST_DEVICE Index stride_advance (void)
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment, PredicateIterator pred_it)
 Loads a fragment and advances the iterator to the next tile. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment)
 Loads a fragment and advances the iterator to the next tile. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void load (Fragment &fragment, PredicateIterator pred_it) const
 Loads a fragment without advancing the iterator. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void load (Fragment &fragment) const
 Loads a fragment without advancing the iterator. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void load (Fragment &fragment, int d)
 Loads a fragment without advancing the iterator. More...
 
- Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
CUTLASS_HOST_DEVICE bool valid (int d, int h, int w, int c) const
 Is the iterator valid? More...
 
- - - - - - - - - + + + + + + + + +

Public Attributes

Params params
 Parameters structure. More...
 
Coord< 4 > thread_offset
 Offset of an individual lane from the start of the tile. More...
 
int stage
 Stage argument enables wrapping after some number of tiles have been loaded. More...
 
Params params
 Parameters structure. More...
 
Coord< 4 > thread_offset
 Offset of an individual lane from the start of the tile. More...
 
int stage
 Stage argument enables wrapping after some number of tiles have been loaded. More...
 
- - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Static Public Attributes

static IteratorAdvance::Kind const kAdvance = Base::kAdvance
 Specifies in which dimension post-increment accesses advance. More...
 
static IteratorFragment::Kind const kIteratorFragment = Base::kIteratorFragment
 Specifies type of iterator fragment storage (Salar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = Base::kMemorySpace
 Source or destination memory space. More...
 
- Static Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
static IteratorAdvance::Kind const kAdvance = Advance_
 Specifies dimension in which post-increment accesses advance. More...
 
static IteratorFragment::Kind const kIteratorFragment = IteratorFragment_
 Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = MemorySpace
 Source or destination memory space. More...
 
static int const kAccessSize = Tile::kC
 The number of scalars accessed per load/store. More...
 
static int const kFragmentSize
 The size of storage needed per fragment. More...
 
static IteratorAdvance::Kind const kAdvance = Base::kAdvance
 Specifies in which dimension post-increment accesses advance. More...
 
static FragmentElementType::Kind const kFragmentElementType = FragmentElementType_
 Specifies type of iterator fragment storage (Scalar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = Base::kMemorySpace
 Source or destination memory space. More...
 
static int const kAccessSize = Base::kAccessSize
 The number of scalars accessed per load/store. More...
 
- Static Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
static IteratorAdvance::Kind const kAdvance = Advance_
 Specifies dimension in which post-increment accesses advance. More...
 
static FragmentElementType::Kind const kFragmentElementType = FragmentElementType_
 Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = MemorySpace
 Source or destination memory space. More...
 
static int const kAccessSize = Traits::kAccessSize
 The number of scalars accessed per load/store. More...
 
static int const kFragmentSize
 The size of storage needed per fragment. More...
 
- - - - - + + + + +

Additional Inherited Members

- Static Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
template<typename PredicateIterator >
static CUTLASS_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &offset=make_Coord(0, 0, 0))
 Initializes a predicate vector. More...
 
- Static Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
template<typename PredicateIterator , typename PredicateFunctor >
static CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, PredicateFunctor const &predicate_func, Coord< 3 > const &offset)
 Initializes a predicate vector. More...
 

Member Typedef Documentation

- -

◆ AccessType

+ +

◆ AccessType

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::AccessType cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::AccessTypetypedef Base::AccessType cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::AccessType
- -

◆ Base

+ +

◆ Base

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef TileIteratorBase<Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_> cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Basetypedef TileIteratorBase<Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_> cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Base
- -

◆ BaseParams

+ +

◆ BaseParams

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Params cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::BaseParamstypedef Base::Params cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::BaseParams
- -

◆ Delta

+ +

◆ Delta

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Delta cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Deltatypedef Base::Delta cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Delta
- -

◆ Fragment

+ +

◆ Fragment

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Fragment cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Fragmenttypedef Base::Fragment cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Fragment
- -

◆ FragmentConstIterator

+ +

◆ FragmentConstIterator

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::FragmentConstIterator cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentConstIteratortypedef Base::FragmentConstIterator cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentConstIterator
- -

◆ FragmentElement

+ +

◆ FragmentElement

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::FragmentElement cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentElementtypedef FragmentElement_ cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentElement
- -

◆ FragmentIterator

+ +

◆ FragmentIterator

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::FragmentIterator cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentIteratortypedef Base::FragmentIterator cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentIterator
- -

◆ FragmentShape

+ +

◆ FragmentShape

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::FragmentShape cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentShapetypedef Base::FragmentShape cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentShape
- -

◆ Index

+ +

◆ Index

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Index cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Indextypedef Base::Index cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Index
- -

◆ Iterations

+ +

◆ Iterations

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Iterations cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Iterationstypedef Base::Iterations cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Iterations
- -

◆ Pointer

+ +

◆ Pointer

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Scalar const* cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Pointertypedef Scalar const* cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Pointer
- -

◆ PredicateVector

+ +

◆ PredicateVector

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::PredicateVector cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::PredicateVectortypedef Base::PredicateVector cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::PredicateVector
- -

◆ Scalar

+ +

◆ Scalar

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Scalar cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Scalartypedef Base::Scalar cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Scalar
- -

◆ SharedStorage

+ +

◆ SharedStorage

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Storage cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::SharedStoragetypedef Base::Storage cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::SharedStorage
- -

◆ Skew

+ +

◆ Skew

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Skew cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Skewtypedef Base::Skew cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Skew
- -

◆ ThreadOffset

+ +

◆ TensorRef

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::ThreadOffset cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::ThreadOffsettypedef TensorRef<Scalar const, 4> cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::TensorRef
- -

◆ Tile

+ +

◆ ThreadOffset

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Tile cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Tiletypedef Base::ThreadOffset cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::ThreadOffset
- -

◆ Traits

+ +

◆ Tile

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- + + +
typedef Base::Traits cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Traitstypedef Base::Tile cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Tile
+
+ +
+ + +

◆ Traits

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + +
typedef Base::Traits cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Traits

Constructor & Destructor Documentation

-
-

◆ TileLoadIterator() [1/3]

+ +

◆ TileLoadIterator() [1/3]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
- + @@ -675,19 +713,19 @@

-

◆ TileLoadIterator() [2/3]

+ +

◆ TileLoadIterator() [2/3]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::TileLoadIterator CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::TileLoadIterator ( )
- +
- + @@ -701,8 +739,8 @@

- - + + @@ -719,19 +757,19 @@

-

◆ TileLoadIterator() [3/3]

+ +

◆ TileLoadIterator() [3/3]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::TileLoadIterator CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::TileLoadIterator ( Params const &  _params, ThreadOffset thread_offset_func = ThreadOffset() ThreadOffset thread_offset_func = ThreadOffset() 
- - - - + + + + - + - - - - + + + + + + + - + @@ -128,7 +131,7 @@ - +
- + @@ -739,8 +777,8 @@

- - + + @@ -751,8 +789,8 @@

- - + + @@ -770,22 +808,23 @@

Member Function Documentation

- -

◆ data()

+ +

◆ add_pointer_offset()

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::TileLoadIterator CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::TileLoadIterator ( Params const &  , SharedStorageshared_storage, Scalar const * ptr,
ThreadOffset thread_offset_func = ThreadOffset() ThreadOffset thread_offset_func = ThreadOffset() 
@@ -797,19 +836,19 @@

-

◆ inc_advance()

+ +

◆ inc_advance()

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- + - - + + +
CUTLASS_HOST_DEVICE Scalar const* cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::data CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::add_pointer_offset () constIndex offset)
- - - - + + + + +
- + @@ -824,19 +863,19 @@

-

◆ inc_d()

+ +

◆ inc_d()

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_advance CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_advance ( )
- + @@ -851,19 +890,19 @@

-

◆ inc_h()

+ +

◆ inc_h()

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_d CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_d ( )
- +
- + @@ -878,19 +917,19 @@

-

◆ inc_stage()

+ +

◆ inc_stage()

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_h CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_h ( )
- +
- + @@ -905,19 +944,19 @@

-

◆ inc_w()

+ +

◆ inc_w()

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_stage CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_stage ( )
- - - - - - - - + + @@ -544,8 +519,8 @@

Member Function Documentation

- -

◆ epilogue()

+ +

◆ epilogue()

@@ -558,14 +533,20 @@

CUTLASS_DEVICE void cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::epilogue

+ + + + + + - + - - + + @@ -582,15 +563,15 @@

-

◆ epilogue_with_or_without_beta()

+ +

◆ epilogue_with_or_without_beta()

template<typename GemmEpilogueTraits_ >
-template<bool kBetaIsZero_>
+template<bool kSourceRequired>

- + @@ -932,13 +971,13 @@

-

◆ initialize_predicates()

+ +

◆ initialize_predicates() [1/2]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename PredicateIterator >

CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_w CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_w ( )
@@ -946,7 +985,7 @@

- + @@ -978,13 +1017,59 @@

-

◆ load() [1/2]

+ +

◆ initialize_predicates() [2/2]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
+template<typename PredicateIterator , typename PredicateFunctor >
+
CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::initialize_predicates CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::initialize_predicates ( PredicateIterator  predicate_it,
+ + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::initialize_predicates (PredicateIterator predicate_it,
PredicateFunctor const & functor,
Coord< 3 > const & block_offset 
)
+
+inline
+
+ +
+ + +

◆ load() [1/3]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename Fragment , typename PredicateIterator >
@@ -992,9 +1077,9 @@

- + - + @@ -1018,13 +1103,13 @@

-

◆ load() [2/2]

+ +

◆ load() [2/3]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::load CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load (FragmentFragment fragment,
@@ -1032,9 +1117,9 @@

- + - + @@ -1048,13 +1133,109 @@

-

◆ load_post_increment() [1/2]

+ +

◆ load() [3/3]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
+template<typename Fragment >
+
CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::load CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load (FragmentFragment fragment) const
+ + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load (Fragmentfragment,
int d 
)
+
+inline
+
+ +
+
+ +

◆ load_element()

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load_element (AccessTypevalue,
int d,
int h,
int w,
int c 
) const
+
+inline
+
+ +
+
+ +

◆ load_post_increment() [1/2]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename Fragment , typename PredicateIterator >
@@ -1062,9 +1243,9 @@

- + - + @@ -1088,13 +1269,13 @@

-

◆ load_post_increment() [2/2]

+ +

◆ load_post_increment() [2/2]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::load_post_increment CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load_post_increment (FragmentFragment fragment,
@@ -1102,9 +1283,9 @@

- + - + @@ -1116,22 +1297,102 @@

+ + + +

◆ operator+=()

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::load_post_increment CUTLASS_HOST_DEVICE void cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load_post_increment (FragmentFragment fragment)
+ + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileLoadIterator& cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::operator+= (Coord< 3 > const & offset)
+
+inline
+
+ +
+ + +

◆ stride_advance()

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE Index cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::stride_advance (void )
+
+inline
+
+

Member Data Documentation

- -

◆ kAdvance

+ +

◆ kAccessSize

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + +
int const cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kAccessSize = Base::kAccessSize
+
+static
+
+ +
+
+ +

◆ kAdvance

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
@@ -1143,19 +1404,19 @@

-

◆ kIteratorFragment

+ +

◆ kFragmentElementType

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
IteratorAdvance::Kind const cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kAdvance = Base::kAdvanceIteratorAdvance::Kind const cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kAdvance = Base::kAdvance
@@ -1167,19 +1428,19 @@

-

◆ kMemorySpace

+ +

◆ kMemorySpace

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
IteratorFragment::Kind const cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kIteratorFragment = Base::kIteratorFragmentFragmentElementType::Kind const cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kFragmentElementType = FragmentElementType_
@@ -1191,48 +1452,48 @@

-

◆ params

+ +

◆ params

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
MemorySpace::Kind const cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kMemorySpace = Base::kMemorySpaceMemorySpace::Kind const cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kMemorySpace = Base::kMemorySpace
- +
Params cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::paramsParams cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::params
- -

◆ stage

+ +

◆ stage

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::stageint cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::stage
- -

◆ thread_offset

+ +

◆ thread_offset

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
Coord<4> cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::thread_offsetCoord<4> cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::thread_offset
@@ -1245,7 +1506,7 @@

diff --git a/docs/structcutlass_1_1TileLoadIterator.png b/docs/structcutlass_1_1TileLoadIterator.png index 30866fa85d..bad871e1e7 100644 Binary files a/docs/structcutlass_1_1TileLoadIterator.png and b/docs/structcutlass_1_1TileLoadIterator.png differ diff --git a/docs/structcutlass_1_1TileLoadIterator_1_1Params-members.html b/docs/structcutlass_1_1TileLoadIterator_1_1Params-members.html index 1977795eae..25b1fdad22 100644 --- a/docs/structcutlass_1_1TileLoadIterator_1_1Params-members.html +++ b/docs/structcutlass_1_1TileLoadIterator_1_1Params-members.html @@ -73,30 +73,40 @@

-
cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params Member List
+
cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params Member List
-

This is the complete list of members for cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params, including all inherited members.

+

This is the complete list of members for cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params, including all inherited members.

- - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + +
inc_advancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
initialize(SharedStorage const &storage)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
initialize(Scalar const *ptr, Index stride_d, Index stride_h, Index stride_w)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
initialize(Scalar const *ptr, Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
initialize()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(Index _stride_d, Index _stride_h, Index _stride_w)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
pointercutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
stride_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
stride_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
stride_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_advancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
inc_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
inc_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
inc_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
initialize(TensorRef const &ref)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize(SharedStorage const &storage)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize(Scalar const *ptr)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize(Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize(Scalar const *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(Coord< 4 > const &stride)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(long long _stride_d, Index _stride_h, Index _stride_w)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(Scalar const *ptr)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(TensorRef const &ref)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(Scalar const *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::Params(long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::Params(Coord< 4 > const &stride)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
pointercutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
stride_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
stride_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
stride_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
diff --git a/docs/structcutlass_1_1TileLoadIterator_1_1Params.html b/docs/structcutlass_1_1TileLoadIterator_1_1Params.html index b25879f36b..70ae9afbd4 100644 --- a/docs/structcutlass_1_1TileLoadIterator_1_1Params.html +++ b/docs/structcutlass_1_1TileLoadIterator_1_1Params.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params Struct Reference +Cutlass: cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params Struct Reference @@ -77,7 +77,7 @@ Public Attributes | List of all members
-
cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params Struct Reference
+
cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params Struct Reference
@@ -86,75 +86,346 @@

#include <tile_iterator.h>

-Inheritance diagram for cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params:
+Inheritance diagram for cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params:
- - -cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params -cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Params + + +cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params +cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Params
- - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Member Functions

CUTLASS_HOST_DEVICE int initialize (SharedStorage const &storage)
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, Index stride_d, Index stride_h, Index stride_w)
 Initializes params to access a raw pointer. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize ()
 
- Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
CUTLASS_HOST_DEVICE int initialize (Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize (Index _stride_d, Index _stride_h, Index _stride_w)
 
CUTLASS_HOST_DEVICE int initialize ()
 
CUTLASS_HOST_DEVICE Params ()
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE Params (Scalar const *ptr)
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE Params (TensorRef const &ref)
 Constructs with a CompactTensorRef<> More...
 
CUTLASS_HOST_DEVICE Params (Scalar const *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE Params (Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w)
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE int initialize (TensorRef const &ref)
 Initializes params to access a raw pointer. More...
 
CUTLASS_HOST_DEVICE int initialize (SharedStorage const &storage)
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr)
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w)
 Initializes params to access a raw pointer. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize ()
 
- Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
CUTLASS_HOST_DEVICE Params ()
 Constructs params. More...
 
CUTLASS_HOST_DEVICE Params (long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)
 Constructs params. More...
 
CUTLASS_HOST_DEVICE Params (Coord< 4 > const &stride)
 Constructs params with a stride vector. More...
 
CUTLASS_HOST_DEVICE int initialize (long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize (Coord< 4 > const &stride)
 Initializes the parameters object from a vector of strides. More...
 
CUTLASS_HOST_DEVICE int initialize (long long _stride_d, Index _stride_h, Index _stride_w)
 Initializes the parameters object from a vector of strides. More...
 
CUTLASS_HOST_DEVICE int initialize ()
 Gotta have this. More...
 
- - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + +

Public Attributes

Scalar const * pointer
 Pointer to memory. More...
 
- Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
Index stride_d
 
Index stride_h
 
Index stride_w
 
Index inc_d
 
Index inc_h
 
Index inc_w
 
Index inc_advance
 
Scalar const * pointer
 Pointer to memory. More...
 
- Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
long long stride_d
 
Index stride_h
 
Index stride_w
 
long long inc_d
 
Index inc_h
 
Index inc_w
 
long long inc_advance
 
+

Constructor & Destructor Documentation

+ +

◆ Params() [1/5]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params ()
+
+inline
+
+ +
+
+ +

◆ Params() [2/5]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (Scalar const * ptr)
+
+inline
+
+ +
+
+ +

◆ Params() [3/5]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (TensorRef const & ref)
+
+inline
+
+ +
+
+ +

◆ Params() [4/5]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (Scalar const * ptr,
long long _stride_d,
Index _stride_h,
Index _stride_w,
long long _inc_d,
Index _inc_h,
Index _inc_w,
Index _inc_advance 
)
+
+inline
+
+ +
+
+ +

◆ Params() [5/5]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (Scalar const * ptr,
long long stride_d,
Index stride_h,
Index stride_w 
)
+
+inline
+
+ +
+

Member Function Documentation

- -

◆ initialize() [1/4]

+ +

◆ initialize() [1/6]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + +
- + - + + + + +
CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (SharedStorage const & TensorRef const & ref)
+
+inline
+
+ +
+ + +

◆ initialize() [2/6]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + - - -
+ + + + + @@ -168,39 +439,67 @@

-

◆ initialize() [2/4]

+ +

◆ initialize() [3/6]

+ +
+
+
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+

CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (SharedStorage const &  storage)
+ + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (Scalar const * ptr)
+
+inline
+
+ +
+ + +

◆ initialize() [4/6]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- - - - - - + + + @@ -156,15 +153,15 @@
- + - + - + - + - + @@ -218,63 +517,63 @@

-

◆ initialize() [3/4]

+ +

◆ initialize() [5/6]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (Scalar const * Scalar const *  ptr,
Index long long  stride_d,
Index Index  stride_h,
Index Index  stride_w 
+ + + + + + - + @@ -118,13 +124,7 @@ - - - - - - - + @@ -144,6 +144,15 @@ + + + + + + + + + @@ -165,7 +174,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1DgemmTraits.png b/docs/structcutlass_1_1gemm_1_1DgemmTraits.png index 151b3c5ab0..6307a7cdaf 100644 Binary files a/docs/structcutlass_1_1gemm_1_1DgemmTraits.png and b/docs/structcutlass_1_1gemm_1_1DgemmTraits.png differ diff --git a/docs/structcutlass_1_1gemm_1_1Fp16SgemmConfig-members.html b/docs/structcutlass_1_1gemm_1_1Fp16SgemmConfig-members.html new file mode 100644 index 0000000000..6941e82eb5 --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1Fp16SgemmConfig-members.html @@ -0,0 +1,118 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+
- + - + - + - + - + - + - + - + - + @@ -292,19 +591,19 @@

-

◆ initialize() [4/4]

+ +

◆ initialize() [6/6]

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (Scalar const * Scalar const *  ptr,
Index long long  _stride_d,
Index Index  _stride_h,
Index Index  _stride_w,
Index long long  _inc_d,
Index Index  _inc_h,
Index Index  _inc_w,
Index Index  _inc_advance 
+ + + @@ -101,7 +104,7 @@
- + @@ -320,16 +619,16 @@

Member Data Documentation

- -

◆ pointer

+ +

◆ pointer

-template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_, typename Scalar_, IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize ( )
- +
Scalar const* cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::pointerScalar const* cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::pointer
@@ -342,7 +641,7 @@

diff --git a/docs/structcutlass_1_1TileLoadIterator_1_1Params.png b/docs/structcutlass_1_1TileLoadIterator_1_1Params.png index 9993389178..831398c27b 100644 Binary files a/docs/structcutlass_1_1TileLoadIterator_1_1Params.png and b/docs/structcutlass_1_1TileLoadIterator_1_1Params.png differ diff --git a/docs/structcutlass_1_1TileLoadStream-members.html b/docs/structcutlass_1_1TileLoadStream-members.html new file mode 100644 index 0000000000..463ff21440 --- /dev/null +++ b/docs/structcutlass_1_1TileLoadStream-members.html @@ -0,0 +1,106 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TileLoadStream< Iterator_, Transformer_ > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1TileLoadStream.html b/docs/structcutlass_1_1TileLoadStream.html new file mode 100644 index 0000000000..ca065a3d70 --- /dev/null +++ b/docs/structcutlass_1_1TileLoadStream.html @@ -0,0 +1,525 @@ + + + + + + + +Cutlass: cutlass::TileLoadStream< Iterator_, Transformer_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TileLoadStream< Iterator_, Transformer_ > Struct Template Reference
+
+
+ +

Generic stream for loading and transforming fragments. +

+ +

#include <tile_stream.h>

+
+Inheritance diagram for cutlass::TileLoadStream< Iterator_, Transformer_ >:
+
+
+ + +cutlass::PredicatedTileLoadStream< Iterator_, PredicateFunctor_, Transformer_ > + +
+ + + + + + + + +

+Classes

struct  Params
 Parameters object used to construct generic load stream. More...
 
struct  PredicateVector
 Empty predicate vector struct. More...
 
+ + + + + + + + + + + + + + + + + + + +

+Public Types

typedef Iterator_ Iterator
 TileLoadIterator. More...
 
typedef Transformer_ Transformer
 Transformer. More...
 
typedef Iterator::Fragment Fragment
 Fragment fetched from source memory. More...
 
typedef Transformer::OutputFragment TransformedFragment
 Output fragment from transformer. More...
 
typedef Iterator::TensorRef TensorRef
 Tensor reference expected by the stream. More...
 
typedef Iterator::Index Index
 Index type. More...
 
+ + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_DEVICE TileLoadStream (Params const &_params, TensorRef const &_ref)
 Ctor. More...
 
CUTLASS_DEVICE TileLoadStream (Params const &_params, Coord< 3 > const &threadblock_offset=make_Coord(0, 0, 0))
 Ctor. More...
 
CUTLASS_DEVICE void copy ()
 Loads a tile and increments the iterator. More...
 
CUTLASS_DEVICE void commit ()
 Commits the fetched fragment and applies a transformation. More...
 
CUTLASS_DEVICE Fragmentintermediate_fragment ()
 Accesses the loaded, transformed fragment. More...
 
CUTLASS_DEVICE TransformedFragmentfragment ()
 Accesses the loaded, transformed fragment. More...
 
+ + + + + + + + + + + + + +

+Public Attributes

Iterator iterator
 Iterator to load tiles. More...
 
Fragment fetched_fragment
 Fragment loaded via iterator. More...
 
Transformer transformer
 Transformation applied to fragments. More...
 
TransformedFragment transformed_fragment
 Transformed fragment from transformer. More...
 
+

Member Typedef Documentation

+ +

◆ Fragment

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Iterator::Fragment cutlass::TileLoadStream< Iterator_, Transformer_ >::Fragment
+
+ +
+
+ +

◆ Index

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Iterator::Index cutlass::TileLoadStream< Iterator_, Transformer_ >::Index
+
+ +
+
+ +

◆ Iterator

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Iterator_ cutlass::TileLoadStream< Iterator_, Transformer_ >::Iterator
+
+ +
+
+ +

◆ TensorRef

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Iterator::TensorRef cutlass::TileLoadStream< Iterator_, Transformer_ >::TensorRef
+
+ +
+
+ +

◆ TransformedFragment

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Transformer::OutputFragment cutlass::TileLoadStream< Iterator_, Transformer_ >::TransformedFragment
+
+ +
+
+ +

◆ Transformer

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Transformer_ cutlass::TileLoadStream< Iterator_, Transformer_ >::Transformer
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ TileLoadStream() [1/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE cutlass::TileLoadStream< Iterator_, Transformer_ >::TileLoadStream (Params const & _params,
TensorRef const & _ref 
)
+
+inline
+
+ +
+
+ +

◆ TileLoadStream() [2/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE cutlass::TileLoadStream< Iterator_, Transformer_ >::TileLoadStream (Params const & _params,
Coord< 3 > const & threadblock_offset = make_Coord(0, 0, 0) 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ commit()

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE void cutlass::TileLoadStream< Iterator_, Transformer_ >::commit ()
+
+inline
+
+ +
+
+ +

◆ copy()

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE void cutlass::TileLoadStream< Iterator_, Transformer_ >::copy ()
+
+inline
+
+ +
+
+ +

◆ fragment()

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE TransformedFragment& cutlass::TileLoadStream< Iterator_, Transformer_ >::fragment ()
+
+inline
+
+ +
+
+ +

◆ intermediate_fragment()

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE Fragment& cutlass::TileLoadStream< Iterator_, Transformer_ >::intermediate_fragment ()
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ fetched_fragment

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
Fragment cutlass::TileLoadStream< Iterator_, Transformer_ >::fetched_fragment
+
+ +
+
+ +

◆ iterator

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
Iterator cutlass::TileLoadStream< Iterator_, Transformer_ >::iterator
+
+ +
+
+ +

◆ transformed_fragment

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
TransformedFragment cutlass::TileLoadStream< Iterator_, Transformer_ >::transformed_fragment
+
+ +
+
+ +

◆ transformer

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
Transformer cutlass::TileLoadStream< Iterator_, Transformer_ >::transformer
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TileLoadStream.png b/docs/structcutlass_1_1TileLoadStream.png new file mode 100644 index 0000000000..3f78724914 Binary files /dev/null and b/docs/structcutlass_1_1TileLoadStream.png differ diff --git a/docs/structcutlass_1_1TileLoadStream_1_1Params-members.html b/docs/structcutlass_1_1TileLoadStream_1_1Params-members.html new file mode 100644 index 0000000000..bafa4e14a2 --- /dev/null +++ b/docs/structcutlass_1_1TileLoadStream_1_1Params-members.html @@ -0,0 +1,93 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TileLoadStream< Iterator_, Transformer_ >::Params Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1TileLoadStream_1_1Params.html b/docs/structcutlass_1_1TileLoadStream_1_1Params.html new file mode 100644 index 0000000000..34eec4d302 --- /dev/null +++ b/docs/structcutlass_1_1TileLoadStream_1_1Params.html @@ -0,0 +1,188 @@ + + + + + + + +Cutlass: cutlass::TileLoadStream< Iterator_, Transformer_ >::Params Struct Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TileLoadStream< Iterator_, Transformer_ >::Params Struct Reference
+
+
+ +

Parameters object used to construct generic load stream. +

+ +

#include <tile_stream.h>

+ + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE Params ()
 Default constructor. More...
 
CUTLASS_HOST_DEVICE Params (typename Iterator::Params const &_iterator)
 Constructor with iterator params. More...
 
+ + + + +

+Public Attributes

Iterator::Params iterator
 Parameters to the iterator. More...
 
+

Constructor & Destructor Documentation

+ +

◆ Params() [1/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileLoadStream< Iterator_, Transformer_ >::Params::Params ()
+
+inline
+
+ +
+
+ +

◆ Params() [2/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileLoadStream< Iterator_, Transformer_ >::Params::Params (typename Iterator::Params const & _iterator)
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ iterator

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
Iterator::Params cutlass::TileLoadStream< Iterator_, Transformer_ >::Params::iterator
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TileLoadStream_1_1PredicateVector.html b/docs/structcutlass_1_1TileLoadStream_1_1PredicateVector.html new file mode 100644 index 0000000000..2ff76de11b --- /dev/null +++ b/docs/structcutlass_1_1TileLoadStream_1_1PredicateVector.html @@ -0,0 +1,95 @@ + + + + + + + +Cutlass: cutlass::TileLoadStream< Iterator_, Transformer_ >::PredicateVector Struct Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TileLoadStream< Iterator_, Transformer_ >::PredicateVector Struct Reference
+
+
+ +

Empty predicate vector struct. +

+ +

#include <tile_stream.h>

+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TileStoreIterator-members.html b/docs/structcutlass_1_1TileStoreIterator-members.html index f24d2dcd78..9499cf9727 100644 --- a/docs/structcutlass_1_1TileStoreIterator-members.html +++ b/docs/structcutlass_1_1TileStoreIterator-members.html @@ -73,59 +73,70 @@

-
cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Member List
+
cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Member List
-

This is the complete list of members for cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >, including all inherited members.

+

This is the complete list of members for cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >, including all inherited members.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AccessType typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Base typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
BaseParams typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
data() constcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
Delta typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Fragment typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentConstIterator typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentElement typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentIterator typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
FragmentShape typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
ImmediateOffsetStrides typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
inc_advance()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
inc_d()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
inc_h()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
inc_stage()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
inc_w()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
Index typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
initialize_predicates(PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
Iterations typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
kAccessSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kAdvancecutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kFragmentSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kIteratorFragmentcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
kMemorySpacecutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >static
paramscutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
PredicateVector typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Scalar typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
SharedStorage typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Skew typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
stagecutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Storage typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
store(Fragment &fragment, PredicateIterator pred_it) constcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
store(Fragment &fragment) constcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
store_post_increment(Fragment &fragment, PredicateIterator pred_it)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
store_post_increment(Fragment &fragment)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
thread_offsetcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
ThreadOffset typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
Tile typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
TileStoreIterator()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
TileStoreIterator(Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
TileStoreIterator(Params const &, SharedStorage &shared_storage, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
Traits typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
valid(int d, int h, int w, int c) constcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >inline
AccessType typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
add_pointer_offset(Index offset)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
Base typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
BaseParams typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Delta typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Fragment typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentConstIterator typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentElement typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentIterator typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
FragmentShape typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
ImmediateOffsetStrides typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
inc_advance()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
inc_d()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
inc_h()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
inc_stage()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
inc_w()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
Index typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
initialize_predicates(PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
initialize_predicates(PredicateIterator predicate_it, PredicateFunctor const &functor, Coord< 3 > const &block_offset)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
Iterations typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
kAccessSizecutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kAdvancecutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kFragmentElementTypecutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kFragmentSizecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
kMemorySpacecutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >static
load(Fragment &fragment, PredicateIterator pred_it) constcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load(Fragment &fragment) constcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load(Fragment &fragment, int d)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load_element(AccessType &value, int d, int h, int w, int c) constcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load_post_increment(Fragment &fragment, PredicateIterator pred_it)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
load_post_increment(Fragment &fragment)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
operator+=(Coord< 3 > const &offset)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
paramscutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Pointer typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
PredicateVector typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Scalar typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
SharedStorage typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Skew typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
stagecutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Storage typedefcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
store(Fragment const &fragment, PredicateIterator pred_it) constcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
store(Fragment const &fragment) constcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
store_element(AccessType const &value, int d, int h, int w, int c)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
store_post_increment(Fragment const &fragment, PredicateIterator pred_it)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
store_post_increment(Fragment const &fragment)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
TensorRef typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
thread_offsetcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
ThreadOffset typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
Tile typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
TileStoreIterator()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
TileStoreIterator(Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
TileStoreIterator(Params const &, Scalar *ptr, ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
Traits typedefcutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
valid(int d, int h, int w, int c) constcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >inline
diff --git a/docs/structcutlass_1_1TileStoreIterator.html b/docs/structcutlass_1_1TileStoreIterator.html index 4fe6f216b9..e1b7f0bd79 100644 --- a/docs/structcutlass_1_1TileStoreIterator.html +++ b/docs/structcutlass_1_1TileStoreIterator.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Struct Template Reference +Cutlass: cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Struct Template Reference @@ -80,7 +80,7 @@ Static Public Attributes | List of all members
-
cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Struct Template Reference
+
cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Struct Template Reference
@@ -89,12 +89,12 @@

#include <tile_iterator.h>

-Inheritance diagram for cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >:
+Inheritance diagram for cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >:
- - -cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > + + +cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
@@ -106,498 +106,572 @@
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Types

typedef TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ > Base
 Base class. More...
 
typedef Base::Traits Traits
 concept TileTraits More...
 
typedef Base::Scalar Scalar
 Scalar element. More...
 
typedef Base::FragmentElement FragmentElement
 Fragment element. More...
 
typedef Base::Index Index
 Index type. More...
 
typedef Base::Skew Skew
 Skew quantity. More...
 
typedef Base::Tile Tile
 Tile shape. More...
 
typedef Base::Delta Delta
 Delta. More...
 
typedef Base::Iterations Iterations
 Iterations. More...
 
typedef Base::ThreadOffset ThreadOffset
 ThreadOffset functor. More...
 
typedef Base::FragmentShape FragmentShape
 Fragment type. More...
 
typedef Base::AccessType AccessType
 Memory access type. More...
 
typedef Base::Fragment Fragment
 Fragment definition. More...
 
typedef Base::FragmentIterator FragmentIterator
 Fragment iterator definition. More...
 
typedef Base::FragmentConstIterator FragmentConstIterator
 Fragment const iterator definition. More...
 
typedef Base::PredicateVector PredicateVector
 Default predicate mask type. More...
 
typedef Base::Storage SharedStorage
 Storage object which may be stored to. More...
 
typedef Base::Params BaseParams
 IteratorBase parameters. More...
 
- Public Types inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
typedef Traits_ Traits
 concept TileTraits More...
 
typedef Scalar_ Scalar
 Scalar element. More...
 
typedef FragmentElement_ FragmentElement
 Fragment element. More...
 
typedef Index_ Index
 Index type. More...
 
typedef Skew_ Skew
 Skew quantity. More...
 
typedef Traits::Tile Tile
 Tile shape. More...
 
typedef Traits::Delta Delta
 Distance along each dimension. More...
 
typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
 The strides in each dimension between different loads/stores. More...
 
typedef Traits::Iterations Iterations
 Iterations. More...
 
typedef Traits::ThreadOffset ThreadOffset
 Thread offset. More...
 
typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
 The elements loaded/store by one instruction. More...
 
typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
 The storage. More...
 
typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
 The fragment. More...
 
typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
 The fragment iterator. More...
 
typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
 The fragment const iterator. More...
 
typedef FragmentIterator::FragmentShape FragmentShape
 The shape of the fragment. More...
 
typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
 Default predicate mask type. More...
 
typedef TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ > Base
 Base class. More...
 
typedef Base::Traits Traits
 concept TileTraits More...
 
typedef Base::Scalar Scalar
 Scalar element. More...
 
typedef Base::FragmentElement FragmentElement
 Fragment element. More...
 
typedef Base::Index Index
 Index type. More...
 
typedef Base::Skew Skew
 Skew quantity. More...
 
typedef Base::Tile Tile
 Tile shape. More...
 
typedef Base::Delta Delta
 Delta. More...
 
typedef Base::Iterations Iterations
 Iterations. More...
 
typedef Base::ThreadOffset ThreadOffset
 ThreadOffset functor. More...
 
typedef Base::FragmentShape FragmentShape
 Fragment type. More...
 
typedef Base::AccessType AccessType
 Memory access type. More...
 
typedef Base::Fragment Fragment
 Fragment definition. More...
 
typedef Base::FragmentIterator FragmentIterator
 Fragment iterator definition. More...
 
typedef Base::FragmentConstIterator FragmentConstIterator
 Fragment const iterator definition. More...
 
typedef Base::PredicateVector PredicateVector
 Default predicate mask type. More...
 
typedef Base::Storage SharedStorage
 Storage object which may be stored to. More...
 
typedef Base::Params BaseParams
 IteratorBase parameters. More...
 
typedef ScalarPointer
 Pointer to underlying type. More...
 
typedef TensorRef< Scalar, 4 > TensorRef
 Tensor reference for the store iterator. More...
 
- Public Types inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
typedef Traits_ Traits
 concept TileTraits More...
 
typedef Scalar_ Scalar
 Scalar element. More...
 
typedef FragmentElement_ FragmentElement
 Fragment element. More...
 
typedef Index_ Index
 Index type. More...
 
typedef Skew_ Skew
 Skew quantity. More...
 
typedef Traits::Tile Tile
 Tile shape. More...
 
typedef Traits::Delta Delta
 Distance along each dimension. More...
 
typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
 The strides in each dimension between different loads/stores. More...
 
typedef Traits::Iterations Iterations
 Iterations. More...
 
typedef Traits::ThreadOffset ThreadOffset
 Thread offset. More...
 
typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
 The elements loaded/store by one instruction. More...
 
typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
 The storage. More...
 
typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
 The fragment. More...
 
typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
 The fragment iterator. More...
 
typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
 The fragment const iterator. More...
 
typedef FragmentIterator::FragmentShape FragmentShape
 The shape of the fragment. More...
 
typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
 Default predicate mask type. More...
 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Member Functions

template<typename PredicateIterator >
CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))
 Initializes a predicate vector. More...
 
CUTLASS_HOST_DEVICE TileStoreIterator ()
 Default constructor. More...
 
CUTLASS_HOST_DEVICE TileStoreIterator (Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
 Constructs a tile store iterator. More...
 
CUTLASS_HOST_DEVICE TileStoreIterator (Params const &, SharedStorage &shared_storage, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
 Constructs a tile store iterator. More...
 
CUTLASS_HOST_DEVICE Scalardata () const
 Returns the current pointer. More...
 
CUTLASS_HOST_DEVICE void inc_d ()
 Increment in the D dimension. More...
 
CUTLASS_HOST_DEVICE void inc_h ()
 Increment in the H dimension. More...
 
CUTLASS_HOST_DEVICE void inc_w ()
 Increment in the W dimension. More...
 
CUTLASS_HOST_DEVICE void inc_advance ()
 Increment in the next dimension. More...
 
CUTLASS_DEVICE void inc_stage ()
 Increment the stage. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void store_post_increment (Fragment &fragment, PredicateIterator pred_it)
 Stores a fragment and advances to the next tile. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void store_post_increment (Fragment &fragment)
 Stores a fragment and advances to the next tile. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void store (Fragment &fragment, PredicateIterator pred_it) const
 Stores a fragment without advancing the iterator. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void store (Fragment &fragment) const
 Stores a fragment without advancing the iterator. More...
 
- Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
CUTLASS_DEVICE bool valid (int d, int h, int w, int c) const
 Is the iterator valid? More...
 
template<typename PredicateIterator >
CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))
 Initializes a predicate vector using a RegularTilePredicateFunctor. More...
 
template<typename PredicateIterator , typename PredicateFunctor >
CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, PredicateFunctor const &functor, Coord< 3 > const &block_offset)
 Initializes a predicate vector using an arbitrary predicate functor. More...
 
CUTLASS_HOST_DEVICE TileStoreIterator ()
 Default constructor. More...
 
CUTLASS_HOST_DEVICE TileStoreIterator (Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
 Constructs a tile store iterator. More...
 
CUTLASS_HOST_DEVICE TileStoreIterator (Params const &, Scalar *ptr, ThreadOffset thread_offset_func=ThreadOffset())
 Constructs a tile store iterator. More...
 
CUTLASS_HOST_DEVICE void inc_d ()
 Increment in the D dimension. More...
 
CUTLASS_HOST_DEVICE void inc_h ()
 Increment in the H dimension. More...
 
CUTLASS_HOST_DEVICE void inc_w ()
 Increment in the W dimension. More...
 
CUTLASS_HOST_DEVICE void inc_advance ()
 Increment in the next dimension. More...
 
CUTLASS_HOST_DEVICE void inc_stage ()
 Increment the stage. More...
 
CUTLASS_HOST_DEVICE TileStoreIteratoroperator+= (Coord< 3 > const &offset)
 Adds a vector offset to the iterator. More...
 
CUTLASS_HOST_DEVICE void add_pointer_offset (Index offset)
 Adds a raw offset to the pointer. More...
 
CUTLASS_HOST_DEVICE void store_element (AccessType const &value, int d, int h, int w, int c)
 Stores a single fragment element into memory. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void store_post_increment (Fragment const &fragment, PredicateIterator pred_it)
 Stores a fragment and advances to the next tile. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void store_post_increment (Fragment const &fragment)
 Stores a fragment and advances to the next tile. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void store (Fragment const &fragment, PredicateIterator pred_it) const
 Stores a fragment without advancing the iterator. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void store (Fragment const &fragment) const
 Stores a fragment without advancing the iterator. More...
 
CUTLASS_HOST_DEVICE void load_element (AccessType &value, int d, int h, int w, int c) const
 Loads a single fragment element from memory. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment, PredicateIterator pred_it)
 Loads a fragment and advances the iterator to the next tile. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment)
 Loads a fragment and advances the iterator to the next tile. More...
 
template<typename Fragment , typename PredicateIterator >
CUTLASS_HOST_DEVICE void load (Fragment &fragment, PredicateIterator pred_it) const
 Loads a fragment without advancing the iterator.. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void load (Fragment &fragment) const
 Loads a fragment without advancing the iterator.. More...
 
template<typename Fragment >
CUTLASS_HOST_DEVICE void load (Fragment &fragment, int d)
 Loads a fragment without advancing the iterator.. More...
 
- Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
CUTLASS_HOST_DEVICE bool valid (int d, int h, int w, int c) const
 Is the iterator valid? More...
 
- - - - - - - - - + + + + + + + + +

Public Attributes

Params params
 Parameters structure. More...
 
Coord< 4 > thread_offset
 Offset of an individual lane from the start of the tile. More...
 
int stage
 The stage. More...
 
Params params
 Parameters structure. More...
 
Coord< 4 > thread_offset
 Offset of an individual lane from the start of the tile. More...
 
int stage
 The stage. More...
 
- - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Static Public Attributes

static IteratorAdvance::Kind const kAdvance = Base::kAdvance
 Specifies in which dimension post-increment accesses advance. More...
 
static IteratorFragment::Kind const kIteratorFragment = Base::kIteratorFragment
 Specifies type of iterator fragment storage (Salar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = Base::kMemorySpace
 Source or destination memory space. More...
 
- Static Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
static IteratorAdvance::Kind const kAdvance = Advance_
 Specifies dimension in which post-increment accesses advance. More...
 
static IteratorFragment::Kind const kIteratorFragment = IteratorFragment_
 Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = MemorySpace
 Source or destination memory space. More...
 
static int const kAccessSize = Tile::kC
 The number of scalars accessed per load/store. More...
 
static int const kFragmentSize
 The size of storage needed per fragment. More...
 
static IteratorAdvance::Kind const kAdvance = Base::kAdvance
 Specifies in which dimension post-increment accesses advance. More...
 
static FragmentElementType::Kind const kFragmentElementType = Base::kFragmentElementType
 Specifies type of iterator fragment storage (Salar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = Base::kMemorySpace
 Source or destination memory space. More...
 
static int const kAccessSize = Base::kAccessSize
 The number of scalars accessed per load/store. More...
 
- Static Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
static IteratorAdvance::Kind const kAdvance = Advance_
 Specifies dimension in which post-increment accesses advance. More...
 
static FragmentElementType::Kind const kFragmentElementType = FragmentElementType_
 Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
 
static MemorySpace::Kind const kMemorySpace = MemorySpace
 Source or destination memory space. More...
 
static int const kAccessSize = Traits::kAccessSize
 The number of scalars accessed per load/store. More...
 
static int const kFragmentSize
 The size of storage needed per fragment. More...
 
- - - - - + + + + +

Additional Inherited Members

- Static Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >
template<typename PredicateIterator >
static CUTLASS_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &offset=make_Coord(0, 0, 0))
 Initializes a predicate vector. More...
 
- Static Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >
template<typename PredicateIterator , typename PredicateFunctor >
static CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, PredicateFunctor const &predicate_func, Coord< 3 > const &offset)
 Initializes a predicate vector. More...
 

Member Typedef Documentation

- -

◆ AccessType

+ +

◆ AccessType

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::AccessType cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::AccessTypetypedef Base::AccessType cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::AccessType
- -

◆ Base

+ +

◆ Base

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef TileIteratorBase<Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_> cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Basetypedef TileIteratorBase<Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_> cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Base
- -

◆ BaseParams

+ +

◆ BaseParams

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Params cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::BaseParamstypedef Base::Params cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::BaseParams
- -

◆ Delta

+ +

◆ Delta

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Delta cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Deltatypedef Base::Delta cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Delta
- -

◆ Fragment

+ +

◆ Fragment

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Fragment cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Fragmenttypedef Base::Fragment cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Fragment
- -

◆ FragmentConstIterator

+ +

◆ FragmentConstIterator

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::FragmentConstIterator cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentConstIteratortypedef Base::FragmentConstIterator cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentConstIterator
- -

◆ FragmentElement

+ +

◆ FragmentElement

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::FragmentElement cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentElementtypedef Base::FragmentElement cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentElement
- -

◆ FragmentIterator

+ +

◆ FragmentIterator

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::FragmentIterator cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentIteratortypedef Base::FragmentIterator cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentIterator
- -

◆ FragmentShape

+ +

◆ FragmentShape

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::FragmentShape cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::FragmentShapetypedef Base::FragmentShape cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::FragmentShape
- -

◆ Index

+ +

◆ Index

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Index cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Indextypedef Base::Index cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Index
- -

◆ Iterations

+ +

◆ Iterations

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Iterations cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Iterationstypedef Base::Iterations cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Iterations
- -

◆ PredicateVector

+ +

◆ Pointer

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::PredicateVector cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::PredicateVectortypedef Scalar* cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Pointer
- -

◆ Scalar

+ +

◆ PredicateVector

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Scalar cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Scalartypedef Base::PredicateVector cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::PredicateVector
- -

◆ SharedStorage

+ +

◆ Scalar

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Storage cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::SharedStoragetypedef Base::Scalar cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Scalar
- -

◆ Skew

+ +

◆ SharedStorage

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Skew cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Skewtypedef Base::Storage cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::SharedStorage
- -

◆ ThreadOffset

+ +

◆ Skew

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::ThreadOffset cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::ThreadOffsettypedef Base::Skew cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Skew
- -

◆ Tile

+ +

◆ TensorRef

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
typedef Base::Tile cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Tiletypedef TensorRef<Scalar, 4> cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::TensorRef
- -

◆ Traits

+ +

◆ ThreadOffset

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- + + +
typedef Base::Traits cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Traitstypedef Base::ThreadOffset cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::ThreadOffset
+
+ +
+ + +

◆ Tile

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + +
typedef Base::Tile cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Tile
+
+ +
+
+ +

◆ Traits

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + +
typedef Base::Traits cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Traits
@@ -605,19 +679,19 @@

Constructor & Destructor Documentation

- -

◆ TileStoreIterator() [1/3]

+ +

◆ TileStoreIterator() [1/3]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- + @@ -632,19 +706,19 @@

-

◆ TileStoreIterator() [2/3]

+ +

◆ TileStoreIterator() [2/3]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::TileStoreIterator CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::TileStoreIterator ( )
- + @@ -658,8 +732,8 @@

- - + + @@ -676,19 +750,19 @@

-

◆ TileStoreIterator() [3/3]

+ +

◆ TileStoreIterator() [3/3]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::TileStoreIterator CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::TileStoreIterator ( Params const &  _params, ThreadOffset thread_offset_func = ThreadOffset() ThreadOffset thread_offset_func = ThreadOffset() 
- + @@ -696,20 +770,14 @@

- - + + - - - - - - - - + + @@ -727,22 +795,23 @@

Member Function Documentation

- -

◆ data()

+ +

◆ add_pointer_offset()

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::TileStoreIterator CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::TileStoreIterator ( Params const &  , SharedStorageshared_storage, Scalarptr,
Coord< 3 > const & block_offset = make_Coord(0, 0, 0),
ThreadOffset thread_offset_func = ThreadOffset() ThreadOffset thread_offset_func = ThreadOffset() 
@@ -754,19 +823,19 @@

-

◆ inc_advance()

+ +

◆ inc_advance()

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- + - - + + +
CUTLASS_HOST_DEVICE Scalar* cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::data CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::add_pointer_offset () constIndex offset)
- + @@ -781,19 +850,19 @@

-

◆ inc_d()

+ +

◆ inc_d()

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_advance CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_advance ( )
- + @@ -808,19 +877,19 @@

-

◆ inc_h()

+ +

◆ inc_h()

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_d CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_d ( )
- + @@ -835,19 +904,19 @@

-

◆ inc_stage()

+ +

◆ inc_stage()

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_h CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_h ( )
- + @@ -862,19 +931,19 @@

-

◆ inc_w()

+ +

◆ inc_w()

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_stage CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_stage ( )
+ + +
- + @@ -889,13 +958,13 @@

-

◆ initialize_predicates()

+ +

◆ initialize_predicates() [1/2]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename PredicateIterator >

CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::inc_w CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::inc_w ( )
@@ -903,7 +972,7 @@

- + @@ -935,13 +1004,59 @@

-

◆ store() [1/2]

+ +

◆ initialize_predicates() [2/2]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
+template<typename PredicateIterator , typename PredicateFunctor >
+
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::initialize_predicates CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::initialize_predicates ( PredicateIterator  predicate_it,
+ + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::initialize_predicates (PredicateIterator predicate_it,
PredicateFunctor const & functor,
Coord< 3 > const & block_offset 
)
+
+inline
+
+ +
+ + +

◆ load() [1/3]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename Fragment , typename PredicateIterator >
@@ -949,9 +1064,9 @@

- + - + @@ -975,13 +1090,13 @@

-

◆ store() [2/2]

+ +

◆ load() [2/3]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::store CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load (FragmentFragment fragment,
@@ -989,9 +1104,9 @@

- + - + @@ -1005,13 +1120,109 @@

-

◆ store_post_increment() [1/2]

+ +

◆ load() [3/3]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
+template<typename Fragment >
+
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::store CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load (FragmentFragment fragment) const
+ + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load (Fragmentfragment,
int d 
)
+
+inline
+
+ +
+ + +

◆ load_element()

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load_element (AccessTypevalue,
int d,
int h,
int w,
int c 
) const
+
+inline
+
+ +
+
+ +

◆ load_post_increment() [1/2]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename Fragment , typename PredicateIterator >
@@ -1019,9 +1230,9 @@

- + - + @@ -1045,13 +1256,13 @@

-

◆ store_post_increment() [2/2]

+ +

◆ load_post_increment() [2/2]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
template<typename Fragment >
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::store_post_increment CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load_post_increment (FragmentFragment fragment,
@@ -1059,9 +1270,233 @@

- + - + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::store_post_increment CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::load_post_increment (FragmentFragmentfragment)
+
+inline
+
+ +
+ + +

◆ operator+=()

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE TileStoreIterator& cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::operator+= (Coord< 3 > const & offset)
+
+inline
+
+ +
+
+ +

◆ store() [1/2]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
+template<typename Fragment , typename PredicateIterator >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::store (Fragment const & fragment,
PredicateIterator pred_it 
) const
+
+inline
+
+ +
+
+ +

◆ store() [2/2]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
+template<typename Fragment >
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::store (Fragment const & fragment) const
+
+inline
+
+ +
+
+ +

◆ store_element()

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::store_element (AccessType const & value,
int d,
int h,
int w,
int c 
)
+
+inline
+
+ +
+
+ +

◆ store_post_increment() [1/2]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
+template<typename Fragment , typename PredicateIterator >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::store_post_increment (Fragment const & fragment,
PredicateIterator pred_it 
)
+
+inline
+
+ +
+
+ +

◆ store_post_increment() [2/2]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
+template<typename Fragment >
+ + +
+ + + + + @@ -1076,19 +1511,43 @@

Member Data Documentation

- -

◆ kAdvance

+ +

◆ kAccessSize

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+
CUTLASS_HOST_DEVICE void cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::store_post_increment (Fragment const &  fragment)
+ + + + +
+ + + + +
int const cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kAccessSize = Base::kAccessSize
+
+static
+
+ +
+ + +

◆ kAdvance

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
@@ -1100,19 +1559,19 @@

-

◆ kIteratorFragment

+ +

◆ kFragmentElementType

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
IteratorAdvance::Kind const cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kAdvance = Base::kAdvanceIteratorAdvance::Kind const cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kAdvance = Base::kAdvance
@@ -1124,19 +1583,19 @@

-

◆ kMemorySpace

+ +

◆ kMemorySpace

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
IteratorFragment::Kind const cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kIteratorFragment = Base::kIteratorFragmentFragmentElementType::Kind const cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kFragmentElementType = Base::kFragmentElementType
@@ -1148,48 +1607,48 @@

-

◆ params

+ +

◆ params

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

- +
MemorySpace::Kind const cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::kMemorySpace = Base::kMemorySpaceMemorySpace::Kind const cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::kMemorySpace = Base::kMemorySpace
- +
Params cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::paramsParams cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::params
- -

◆ stage

+ +

◆ stage

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::stageint cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::stage
- -

◆ thread_offset

+ +

◆ thread_offset

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- +
Coord<4> cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::thread_offsetCoord<4> cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::thread_offset
@@ -1202,7 +1661,7 @@

diff --git a/docs/structcutlass_1_1TileStoreIterator.png b/docs/structcutlass_1_1TileStoreIterator.png index a20f18cfe5..c86ef843d3 100644 Binary files a/docs/structcutlass_1_1TileStoreIterator.png and b/docs/structcutlass_1_1TileStoreIterator.png differ diff --git a/docs/structcutlass_1_1TileStoreIterator_1_1Params-members.html b/docs/structcutlass_1_1TileStoreIterator_1_1Params-members.html index 5d34eba8fe..9a12c4821f 100644 --- a/docs/structcutlass_1_1TileStoreIterator_1_1Params-members.html +++ b/docs/structcutlass_1_1TileStoreIterator_1_1Params-members.html @@ -73,30 +73,39 @@

-
cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params Member List
+
cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params Member List
-

This is the complete list of members for cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params, including all inherited members.

+

This is the complete list of members for cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params, including all inherited members.

- - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + +
inc_advancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
initialize(SharedStorage &storage)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
initialize(Scalar *ptr, Index stride_d, Index stride_h, Index stride_w)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
initialize(Scalar *ptr, Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
initialize()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(Index _stride_d, Index _stride_h, Index _stride_w)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
pointercutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
stride_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
stride_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
stride_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
inc_advancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
inc_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
inc_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
inc_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
initialize(SharedStorage &storage)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize(Scalar *ptr)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize(Scalar *ptr, long long stride_d, Index stride_h, Index stride_w)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize(Scalar *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
initialize()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(Coord< 4 > const &stride)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::initialize(long long _stride_d, Index _stride_h, Index _stride_w)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params()cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(Scalar *ptr)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(TensorRef const &ref)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(Scalar *ptr, long long stride_d, Index stride_h, Index stride_w)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
Params(Scalar *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::Params(long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
cutlass::TileIteratorBase::Params::Params(Coord< 4 > const &stride)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
pointercutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
stride_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
stride_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
stride_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
diff --git a/docs/structcutlass_1_1TileStoreIterator_1_1Params.html b/docs/structcutlass_1_1TileStoreIterator_1_1Params.html index 3da80d41b2..909a87cc9d 100644 --- a/docs/structcutlass_1_1TileStoreIterator_1_1Params.html +++ b/docs/structcutlass_1_1TileStoreIterator_1_1Params.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params Struct Reference +Cutlass: cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params Struct Reference @@ -77,7 +77,7 @@ Public Attributes | List of all members
-
cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params Struct Reference
+
cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params Struct Reference
@@ -86,75 +86,311 @@

#include <tile_iterator.h>

-Inheritance diagram for cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params:
+Inheritance diagram for cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params:
- - -cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params + + +cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
- - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Member Functions

CUTLASS_HOST_DEVICE int initialize (SharedStorage &storage)
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar *ptr, Index stride_d, Index stride_h, Index stride_w)
 Initializes params to access a raw pointer. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar *ptr, Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize ()
 Initializes params to default values. More...
 
- Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
CUTLASS_HOST_DEVICE int initialize (Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize (Index _stride_d, Index _stride_h, Index _stride_w)
 
CUTLASS_HOST_DEVICE int initialize ()
 
CUTLASS_HOST_DEVICE Params ()
 
CUTLASS_HOST_DEVICE Params (Scalar *ptr)
 
CUTLASS_HOST_DEVICE Params (TensorRef const &ref)
 Constructs with a CompactTensorRef<> More...
 
CUTLASS_HOST_DEVICE Params (Scalar *ptr, long long stride_d, Index stride_h, Index stride_w)
 
CUTLASS_HOST_DEVICE Params (Scalar *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
 
CUTLASS_HOST_DEVICE int initialize (SharedStorage &storage)
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar *ptr)
 Initialize params to access storage object. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar *ptr, long long stride_d, Index stride_h, Index stride_w)
 Initializes params to access a raw pointer. More...
 
CUTLASS_HOST_DEVICE int initialize (Scalar *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize ()
 Initializes params to default values. More...
 
- Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
CUTLASS_HOST_DEVICE Params ()
 Constructs params. More...
 
CUTLASS_HOST_DEVICE Params (long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)
 Constructs params. More...
 
CUTLASS_HOST_DEVICE Params (Coord< 4 > const &stride)
 Constructs params with a stride vector. More...
 
CUTLASS_HOST_DEVICE int initialize (long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)
 Initializes params. More...
 
CUTLASS_HOST_DEVICE int initialize (Coord< 4 > const &stride)
 Initializes the parameters object from a vector of strides. More...
 
CUTLASS_HOST_DEVICE int initialize (long long _stride_d, Index _stride_h, Index _stride_w)
 Initializes the parameters object from individual stride values. More...
 
CUTLASS_HOST_DEVICE int initialize ()
 Gotta have this. More...
 
- - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + +

Public Attributes

Scalarpointer
 Pointer to memory. More...
 
- Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
Index stride_d
 
Index stride_h
 
Index stride_w
 
Index inc_d
 
Index inc_h
 
Index inc_w
 
Index inc_advance
 
Scalarpointer
 Pointer to memory. More...
 
- Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
long long stride_d
 
Index stride_h
 
Index stride_w
 
long long inc_d
 
Index inc_h
 
Index inc_w
 
long long inc_advance
 
+

Constructor & Destructor Documentation

+ +

◆ Params() [1/5]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params ()
+
+inline
+
+ +
+
+ +

◆ Params() [2/5]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (Scalarptr)
+
+inline
+
+ +
+
+ +

◆ Params() [3/5]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (TensorRef const & ref)
+
+inline
+
+ +
+
+ +

◆ Params() [4/5]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (Scalarptr,
long long stride_d,
Index stride_h,
Index stride_w 
)
+
+inline
+
+ +
+
+ +

◆ Params() [5/5]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::Params (Scalarptr,
long long _stride_d,
Index _stride_h,
Index _stride_w,
long long _inc_d,
Index _inc_h,
Index _inc_w,
Index _inc_advance 
)
+
+inline
+
+ +
+

Member Function Documentation

- -

◆ initialize() [1/4]

+ +

◆ initialize() [1/5]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- + - + @@ -168,39 +404,67 @@

-

◆ initialize() [2/4]

+ +

◆ initialize() [2/5]

+ +
+
+
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+

CUTLASS_HOST_DEVICE int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (SharedStorageSharedStorage storage)
+ + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (Scalarptr)
+
+inline
+
+ +
+ + +

◆ initialize() [3/5]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
- + - + - + - + - + @@ -218,63 +482,63 @@

-

◆ initialize() [3/4]

+ +

◆ initialize() [4/5]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (ScalarScalar ptr,
Index long long  stride_d,
Index Index  stride_h,
Index Index  stride_w 
+ + +
- + - + - + - + - + - + - + - + - + @@ -292,19 +556,19 @@

-

◆ initialize() [4/4]

+ +

◆ initialize() [5/5]

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>

CUTLASS_HOST_DEVICE int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize (ScalarScalar ptr,
Index long long  _stride_d,
Index Index  _stride_h,
Index Index  _stride_w,
Index long long  _inc_d,
Index Index  _inc_h,
Index Index  _inc_w,
Index Index  _inc_advance 
- + @@ -320,16 +584,16 @@

Member Data Documentation

- -

◆ pointer

+ +

◆ pointer

-template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, IteratorFragment::Kind IteratorFragment_ = IteratorFragment::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
+template<typename Traits_ , typename Scalar_ , IteratorAdvance::Kind Advance_ = IteratorAdvance::kH, MemorySpace::Kind MemorySpace = MemorySpace::kGeneric, typename Index_ = int, typename FragmentElement_ = Scalar_, FragmentElementType::Kind FragmentElementType_ = FragmentElementType::kScalar, typename Skew_ = Shape<0, 0, 0, 0>>
CUTLASS_HOST_DEVICE int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::initialize ( )
- +
Scalar* cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params::pointerScalar* cutlass::TileStoreIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params::pointer
@@ -342,7 +606,7 @@

diff --git a/docs/structcutlass_1_1TileStoreIterator_1_1Params.png b/docs/structcutlass_1_1TileStoreIterator_1_1Params.png index aabb9a31b6..1e0529a453 100644 Binary files a/docs/structcutlass_1_1TileStoreIterator_1_1Params.png and b/docs/structcutlass_1_1TileStoreIterator_1_1Params.png differ diff --git a/docs/structcutlass_1_1TileStoreStream-members.html b/docs/structcutlass_1_1TileStoreStream-members.html new file mode 100644 index 0000000000..03cb5045ab --- /dev/null +++ b/docs/structcutlass_1_1TileStoreStream-members.html @@ -0,0 +1,107 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TileStoreStream< Iterator_, Transformer_ > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1TileStoreStream.html b/docs/structcutlass_1_1TileStoreStream.html new file mode 100644 index 0000000000..06c454fc27 --- /dev/null +++ b/docs/structcutlass_1_1TileStoreStream.html @@ -0,0 +1,556 @@ + + + + + + + +Cutlass: cutlass::TileStoreStream< Iterator_, Transformer_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TileStoreStream< Iterator_, Transformer_ > Struct Template Reference
+
+
+ +

Generic stream for transforming and storing fragments. +

+ +

#include <tile_stream.h>

+
+Inheritance diagram for cutlass::TileStoreStream< Iterator_, Transformer_ >:
+
+
+ + +cutlass::PredicatedTileStoreStream< Iterator_, PredicateFunctor_, Transformer_ > + +
+ + + + + + + + +

+Classes

struct  Params
 Parameters used to construct the stream. More...
 
struct  PredicateVector
 Empty predicate vector struct. More...
 
+ + + + + + + + + + + + + + + + + + + +

+Public Types

typedef Iterator_ Iterator
 Iterator type used to store tiles. More...
 
typedef Transformer_ Transformer
 Transformer. More...
 
typedef Transformer::InputFragment Fragment
 Source fragment. More...
 
typedef Transformer::OutputFragment TransformedFragment
 Transformed fragment, compatible with Iterator::Fragment. More...
 
typedef Iterator::TensorRef TensorRef
 Tensor reference expected by the underlying iterator. More...
 
typedef Iterator::Index Index
 Index type. More...
 
+ + + + + + + + + + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_DEVICE TileStoreStream (Params const &_params, TensorRef const &_ref)
 Ctor. More...
 
CUTLASS_DEVICE TileStoreStream (Params const &_params, Coord< 3 > const &threadblock_offset=make_Coord(0, 0, 0))
 Ctor. More...
 
CUTLASS_DEVICE void copy ()
 Stores a fragment and increments the iterator. More...
 
CUTLASS_DEVICE void copy (Fragment const &frag)
 Stores a fragment and increments the iterator. More...
 
CUTLASS_DEVICE void commit ()
 Commits the store operation. More...
 
CUTLASS_DEVICE Fragmentfragment ()
 Accesses the transformed fragment. More...
 
CUTLASS_DEVICE TransformedFragmentintermediate_fragment ()
 Accesses the fragment after transforming. More...
 
+ + + + + + + + + + + + + +

+Public Attributes

Iterator iterator
 Iterator to store tiles. More...
 
Transformer transformer
 Transformation applied to inputs. More...
 
Fragment source_fragment
 Source fragment. More...
 
TransformedFragment transformed_fragment
 Transformed fragment from transformer. More...
 
+

Member Typedef Documentation

+ +

◆ Fragment

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Transformer::InputFragment cutlass::TileStoreStream< Iterator_, Transformer_ >::Fragment
+
+ +
+
+ +

◆ Index

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Iterator::Index cutlass::TileStoreStream< Iterator_, Transformer_ >::Index
+
+ +
+
+ +

◆ Iterator

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Iterator_ cutlass::TileStoreStream< Iterator_, Transformer_ >::Iterator
+
+ +
+
+ +

◆ TensorRef

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Iterator::TensorRef cutlass::TileStoreStream< Iterator_, Transformer_ >::TensorRef
+
+ +
+
+ +

◆ TransformedFragment

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Transformer::OutputFragment cutlass::TileStoreStream< Iterator_, Transformer_ >::TransformedFragment
+
+ +
+
+ +

◆ Transformer

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
typedef Transformer_ cutlass::TileStoreStream< Iterator_, Transformer_ >::Transformer
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ TileStoreStream() [1/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE cutlass::TileStoreStream< Iterator_, Transformer_ >::TileStoreStream (Params const & _params,
TensorRef const & _ref 
)
+
+inline
+
+ +
+
+ +

◆ TileStoreStream() [2/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE cutlass::TileStoreStream< Iterator_, Transformer_ >::TileStoreStream (Params const & _params,
Coord< 3 > const & threadblock_offset = make_Coord(0, 0, 0) 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ commit()

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE void cutlass::TileStoreStream< Iterator_, Transformer_ >::commit ()
+
+inline
+
+ +
+
+ +

◆ copy() [1/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE void cutlass::TileStoreStream< Iterator_, Transformer_ >::copy ()
+
+inline
+
+ +
+
+ +

◆ copy() [2/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE void cutlass::TileStoreStream< Iterator_, Transformer_ >::copy (Fragment const & frag)
+
+inline
+
+ +
+
+ +

◆ fragment()

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE Fragment& cutlass::TileStoreStream< Iterator_, Transformer_ >::fragment ()
+
+inline
+
+ +
+
+ +

◆ intermediate_fragment()

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE TransformedFragment& cutlass::TileStoreStream< Iterator_, Transformer_ >::intermediate_fragment ()
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ iterator

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
Iterator cutlass::TileStoreStream< Iterator_, Transformer_ >::iterator
+
+ +
+
+ +

◆ source_fragment

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
Fragment cutlass::TileStoreStream< Iterator_, Transformer_ >::source_fragment
+
+ +
+
+ +

◆ transformed_fragment

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
TransformedFragment cutlass::TileStoreStream< Iterator_, Transformer_ >::transformed_fragment
+
+ +
+
+ +

◆ transformer

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
Transformer cutlass::TileStoreStream< Iterator_, Transformer_ >::transformer
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TileStoreStream.png b/docs/structcutlass_1_1TileStoreStream.png new file mode 100644 index 0000000000..c673a40f9d Binary files /dev/null and b/docs/structcutlass_1_1TileStoreStream.png differ diff --git a/docs/structcutlass_1_1TileStoreStream_1_1Params-members.html b/docs/structcutlass_1_1TileStoreStream_1_1Params-members.html new file mode 100644 index 0000000000..2f4e2f61fe --- /dev/null +++ b/docs/structcutlass_1_1TileStoreStream_1_1Params-members.html @@ -0,0 +1,93 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TileStoreStream< Iterator_, Transformer_ >::Params Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1TileStoreStream_1_1Params.html b/docs/structcutlass_1_1TileStoreStream_1_1Params.html new file mode 100644 index 0000000000..e4b3edcafc --- /dev/null +++ b/docs/structcutlass_1_1TileStoreStream_1_1Params.html @@ -0,0 +1,188 @@ + + + + + + + +Cutlass: cutlass::TileStoreStream< Iterator_, Transformer_ >::Params Struct Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::TileStoreStream< Iterator_, Transformer_ >::Params Struct Reference
+
+
+ +

Parameters used to construct the stream. +

+ +

#include <tile_stream.h>

+ + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE Params ()
 Default constructor. More...
 
CUTLASS_HOST_DEVICE Params (typename Iterator::Params const &_iterator)
 Constructor with iterator params. More...
 
+ + + + +

+Public Attributes

Iterator::Params iterator
 Parameters to the iterator. More...
 
+

Constructor & Destructor Documentation

+ +

◆ Params() [1/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileStoreStream< Iterator_, Transformer_ >::Params::Params ()
+
+inline
+
+ +
+
+ +

◆ Params() [2/2]

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + + +
+ + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::TileStoreStream< Iterator_, Transformer_ >::Params::Params (typename Iterator::Params const & _iterator)
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ iterator

+ +
+
+
+template<typename Iterator_ , typename Transformer_ = Copy<typename Iterator_::Fragment>>
+ + + + +
Iterator::Params cutlass::TileStoreStream< Iterator_, Transformer_ >::Params::iterator
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TileStoreStream_1_1PredicateVector.html b/docs/structcutlass_1_1TileStoreStream_1_1PredicateVector.html new file mode 100644 index 0000000000..283d739eed --- /dev/null +++ b/docs/structcutlass_1_1TileStoreStream_1_1PredicateVector.html @@ -0,0 +1,95 @@ + + + + + + + +Cutlass: cutlass::TileStoreStream< Iterator_, Transformer_ >::PredicateVector Struct Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::TileStoreStream< Iterator_, Transformer_ >::PredicateVector Struct Reference
+
+
+ +

Empty predicate vector struct. +

+ +

#include <tile_stream.h>

+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1TileTraits-members.html b/docs/structcutlass_1_1TileTraits-members.html index c4d8ddf526..7f5bea37a1 100644 --- a/docs/structcutlass_1_1TileTraits-members.html +++ b/docs/structcutlass_1_1TileTraits-members.html @@ -73,20 +73,22 @@

-
cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_ > Member List
+
cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_, AccessSize > Member List
diff --git a/docs/structcutlass_1_1TileTraits.html b/docs/structcutlass_1_1TileTraits.html index b81a519418..b95fdf273e 100644 --- a/docs/structcutlass_1_1TileTraits.html +++ b/docs/structcutlass_1_1TileTraits.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_ > Struct Template Reference +Cutlass: cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_, AccessSize > Struct Template Reference @@ -74,9 +74,10 @@
-
cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_ > Struct Template Reference
+
cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_, AccessSize > Struct Template Reference
@@ -87,82 +88,132 @@ - - - - - - - - - - - - + + + + + + + + + + + + + + + +

Public Types

typedef Tile_ Tile
 Shape of the tile. More...
 
typedef Delta_ Delta
 Number of steps between accesses along each dimension. More...
 
typedef Iterations_ Iterations
 Number of accesses performed. More...
 
typedef ThreadOffset_ ThreadOffset
 Functor that returns the logical coordinate of each entity's initial offset in the tile. More...
 
typedef Tile_ Tile
 Shape of the tile. More...
 
typedef Delta_ Delta
 Number of steps between accesses along each dimension. More...
 
typedef Iterations_ Iterations
 Number of accesses performed. More...
 
typedef ThreadOffset_ ThreadOffset
 Functor that returns the logical coordinate of each entity's initial offset in the tile. More...
 
typedef Shape< 0, 0, 0, 0 > ImmediateOffsetStrides
 Strides for immediate offset computation. More...
 
+ + + +

+Static Public Attributes

static int const kAccessSize = AccessSize
 Access size. More...
 

Member Typedef Documentation

- -

◆ Delta

+ +

◆ Delta

-template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ >
+template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ , int AccessSize>
- +
typedef Delta_ cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_ >::Deltatypedef Delta_ cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_, AccessSize >::Delta
- -

◆ Iterations

+ +

◆ ImmediateOffsetStrides

-template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ >
+template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ , int AccessSize>
- +
typedef Iterations_ cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_ >::Iterationstypedef Shape<0, 0, 0, 0> cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_, AccessSize >::ImmediateOffsetStrides
- -

◆ ThreadOffset

+ +

◆ Iterations

-template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ >
+template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ , int AccessSize>
- +
typedef ThreadOffset_ cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_ >::ThreadOffsettypedef Iterations_ cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_, AccessSize >::Iterations
- -

◆ Tile

+ +

◆ ThreadOffset

-template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ >
+template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ , int AccessSize>
- +
typedef Tile_ cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_ >::Tiletypedef ThreadOffset_ cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_, AccessSize >::ThreadOffset
+
+ + +

◆ Tile

+ +
+
+
+template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ , int AccessSize>
+ + + + +
typedef Tile_ cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_, AccessSize >::Tile
+
+ +
+
+

Member Data Documentation

+ +

◆ kAccessSize

+ +
+
+
+template<typename Tile_ , typename Delta_ , typename Iterations_ , typename ThreadOffset_ , int AccessSize>
+ + + + + +
+ + + + +
int const cutlass::TileTraits< Tile_, Delta_, Iterations_, ThreadOffset_, AccessSize >::kAccessSize = AccessSize
+
+static
+
+

The documentation for this struct was generated from the following file:
diff --git a/docs/structcutlass_1_1TileTraitsContiguousMajor.html b/docs/structcutlass_1_1TileTraitsContiguousMajor.html index 777f6136a6..667bce4d04 100644 --- a/docs/structcutlass_1_1TileTraitsContiguousMajor.html +++ b/docs/structcutlass_1_1TileTraitsContiguousMajor.html @@ -224,7 +224,7 @@

diff --git a/docs/structcutlass_1_1TileTraitsStandard-members.html b/docs/structcutlass_1_1TileTraitsStandard-members.html index 4732a54bcb..40677b2654 100644 --- a/docs/structcutlass_1_1TileTraitsStandard-members.html +++ b/docs/structcutlass_1_1TileTraitsStandard-members.html @@ -79,14 +79,15 @@

This is the complete list of members for cutlass::TileTraitsStandard< Tile_, Threads >, including all inherited members.

- - - - + + + + +
kThreadscutlass::TileTraitsStandard< Tile_, Threads >static
kWarpCountcutlass::TileTraitsStandard< Tile_, Threads >static
kWarpSizecutlass::TileTraitsStandard< Tile_, Threads >static
Tile typedefcutlass::TileTraitsStandard< Tile_, Threads >
kAccessSizecutlass::TileTraitsStandard< Tile_, Threads >static
kThreadscutlass::TileTraitsStandard< Tile_, Threads >static
kWarpCountcutlass::TileTraitsStandard< Tile_, Threads >static
kWarpSizecutlass::TileTraitsStandard< Tile_, Threads >static
Tile typedefcutlass::TileTraitsStandard< Tile_, Threads >
diff --git a/docs/structcutlass_1_1TileTraitsStandard.html b/docs/structcutlass_1_1TileTraitsStandard.html index 7806ece42f..a7a26e38af 100644 --- a/docs/structcutlass_1_1TileTraitsStandard.html +++ b/docs/structcutlass_1_1TileTraitsStandard.html @@ -103,6 +103,9 @@

static int const kWarpCount = kThreads / kWarpSize
 Number of participating warps. More...
 
static int const kAccessSize = 1
 By default, do not do scalar loads. More...
 

Member Typedef Documentation

@@ -122,6 +125,30 @@

Member Data Documentation

+ +

◆ kAccessSize

+ +
+
+
+template<typename Tile_ , int Threads>
+ + + + + +
+ + + + +
int const cutlass::TileTraitsStandard< Tile_, Threads >::kAccessSize = 1
+
+static
+
+ +
+

◆ kThreads

@@ -200,7 +227,7 @@

diff --git a/docs/structcutlass_1_1TileTraitsStrideMajor-members.html b/docs/structcutlass_1_1TileTraitsStrideMajor-members.html index 2ae5190886..76a8c0323f 100644 --- a/docs/structcutlass_1_1TileTraitsStrideMajor-members.html +++ b/docs/structcutlass_1_1TileTraitsStrideMajor-members.html @@ -88,7 +88,7 @@

diff --git a/docs/structcutlass_1_1TileTraitsStrideMajor.html b/docs/structcutlass_1_1TileTraitsStrideMajor.html index d24bc59b03..40c37b8dc0 100644 --- a/docs/structcutlass_1_1TileTraitsStrideMajor.html +++ b/docs/structcutlass_1_1TileTraitsStrideMajor.html @@ -224,7 +224,7 @@

diff --git a/docs/structcutlass_1_1TileTraitsWarpRake-members.html b/docs/structcutlass_1_1TileTraitsWarpRake-members.html index e76c228b18..24bef8e492 100644 --- a/docs/structcutlass_1_1TileTraitsWarpRake-members.html +++ b/docs/structcutlass_1_1TileTraitsWarpRake-members.html @@ -91,7 +91,7 @@

diff --git a/docs/structcutlass_1_1TileTraitsWarpRake.html b/docs/structcutlass_1_1TileTraitsWarpRake.html index 771a8e410e..dfe8d8976e 100644 --- a/docs/structcutlass_1_1TileTraitsWarpRake.html +++ b/docs/structcutlass_1_1TileTraitsWarpRake.html @@ -318,7 +318,7 @@

diff --git a/docs/structcutlass_1_1TileTraitsWarpRake_1_1ThreadOffset-members.html b/docs/structcutlass_1_1TileTraitsWarpRake_1_1ThreadOffset-members.html index e816dc744b..942245c81f 100644 --- a/docs/structcutlass_1_1TileTraitsWarpRake_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1TileTraitsWarpRake_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@

diff --git a/docs/structcutlass_1_1TileTraitsWarpRake_1_1ThreadOffset.html b/docs/structcutlass_1_1TileTraitsWarpRake_1_1ThreadOffset.html index cc0d9db2c1..26010fb811 100644 --- a/docs/structcutlass_1_1TileTraitsWarpRake_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1TileTraitsWarpRake_1_1ThreadOffset.html @@ -125,7 +125,7 @@

diff --git a/docs/structcutlass_1_1TiledThreadOffset-members.html b/docs/structcutlass_1_1TiledThreadOffset-members.html index bc28ca5dd8..9088eaddb7 100644 --- a/docs/structcutlass_1_1TiledThreadOffset-members.html +++ b/docs/structcutlass_1_1TiledThreadOffset-members.html @@ -83,7 +83,7 @@

diff --git a/docs/structcutlass_1_1TiledThreadOffset.html b/docs/structcutlass_1_1TiledThreadOffset.html index dbbccc1f22..574a6f5464 100644 --- a/docs/structcutlass_1_1TiledThreadOffset.html +++ b/docs/structcutlass_1_1TiledThreadOffset.html @@ -125,7 +125,7 @@

diff --git a/docs/structcutlass_1_1TrivialPredicateTileAdapter-members.html b/docs/structcutlass_1_1TrivialPredicateTileAdapter-members.html index 1bb156f6fc..65eb939f14 100644 --- a/docs/structcutlass_1_1TrivialPredicateTileAdapter-members.html +++ b/docs/structcutlass_1_1TrivialPredicateTileAdapter-members.html @@ -84,7 +84,7 @@

diff --git a/docs/structcutlass_1_1TrivialPredicateTileAdapter.html b/docs/structcutlass_1_1TrivialPredicateTileAdapter.html index f93bb89cf5..1a8178abea 100644 --- a/docs/structcutlass_1_1TrivialPredicateTileAdapter.html +++ b/docs/structcutlass_1_1TrivialPredicateTileAdapter.html @@ -175,7 +175,7 @@

diff --git a/docs/structcutlass_1_1VectorTraits-members.html b/docs/structcutlass_1_1VectorTraits-members.html index 011de7e914..f19499038c 100644 --- a/docs/structcutlass_1_1VectorTraits-members.html +++ b/docs/structcutlass_1_1VectorTraits-members.html @@ -86,7 +86,7 @@

diff --git a/docs/structcutlass_1_1VectorTraits.html b/docs/structcutlass_1_1VectorTraits.html index 80070a5b10..b19316ae77 100644 --- a/docs/structcutlass_1_1VectorTraits.html +++ b/docs/structcutlass_1_1VectorTraits.html @@ -192,7 +192,7 @@

diff --git a/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01_4-members.html b/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01_4-members.html index e5e2d78019..d5a0cb27d6 100644 --- a/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01_4-members.html +++ b/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01_4-members.html @@ -86,7 +86,7 @@

diff --git a/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01_4.html b/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01_4.html index 39561291cf..d029b5663b 100644 --- a/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01_4.html +++ b/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01_4.html @@ -192,7 +192,7 @@

diff --git a/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01const_01_4-members.html b/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01const_01_4-members.html index a038a43122..3ef83c65c4 100644 --- a/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01const_01_4-members.html +++ b/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01const_01_4-members.html @@ -86,7 +86,7 @@

diff --git a/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01const_01_4.html b/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01const_01_4.html index 7f9a574310..d0a8efc2d3 100644 --- a/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01const_01_4.html +++ b/docs/structcutlass_1_1VectorTraits_3_01Vector_3_01T_00_01Lanes_01_4_01const_01_4.html @@ -192,7 +192,7 @@

diff --git a/docs/structcutlass_1_1Vectorize-members.html b/docs/structcutlass_1_1Vectorize-members.html index 2f3903bd3b..3a8634e083 100644 --- a/docs/structcutlass_1_1Vectorize-members.html +++ b/docs/structcutlass_1_1Vectorize-members.html @@ -83,7 +83,7 @@

diff --git a/docs/structcutlass_1_1Vectorize.html b/docs/structcutlass_1_1Vectorize.html index d728c0a27a..9d5c1b22d0 100644 --- a/docs/structcutlass_1_1Vectorize.html +++ b/docs/structcutlass_1_1Vectorize.html @@ -110,7 +110,7 @@

diff --git a/docs/structcutlass_1_1Vectorize_3_01Vector_3_01bin1__t_00_0132_01_4_00_01kLanes___01_4-members.html b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01bin1__t_00_0132_01_4_00_01kLanes___01_4-members.html new file mode 100644 index 0000000000..f49dfa17c4 --- /dev/null +++ b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01bin1__t_00_0132_01_4_00_01kLanes___01_4-members.html @@ -0,0 +1,91 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::Vectorize< Vector< bin1_t, 32 >, kLanes_ > Member List
+
+
+ +

This is the complete list of members for cutlass::Vectorize< Vector< bin1_t, 32 >, kLanes_ >, including all inherited members.

+ + +
Type typedefcutlass::Vectorize< Vector< bin1_t, 32 >, kLanes_ >
+ + + + diff --git a/docs/structcutlass_1_1Vectorize_3_01Vector_3_01bin1__t_00_0132_01_4_00_01kLanes___01_4.html b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01bin1__t_00_0132_01_4_00_01kLanes___01_4.html new file mode 100644 index 0000000000..8f743cbe9c --- /dev/null +++ b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01bin1__t_00_0132_01_4_00_01kLanes___01_4.html @@ -0,0 +1,118 @@ + + + + + + + +Cutlass: cutlass::Vectorize< Vector< bin1_t, 32 >, kLanes_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::Vectorize< Vector< bin1_t, 32 >, kLanes_ > Struct Template Reference
+
+
+ +

#include <vector.h>

+ + + + +

+Public Types

typedef Vector< bin1_t, kLanes_ *32 > Type
 
+

Member Typedef Documentation

+ +

◆ Type

+ +
+
+
+template<int kLanes_>
+ + + + +
typedef Vector<bin1_t, kLanes_ * 32> cutlass::Vectorize< Vector< bin1_t, 32 >, kLanes_ >::Type
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1Vectorize_3_01Vector_3_01int4__t_00_018_01_4_00_01kLanes___01_4-members.html b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01int4__t_00_018_01_4_00_01kLanes___01_4-members.html new file mode 100644 index 0000000000..ec223dacdd --- /dev/null +++ b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01int4__t_00_018_01_4_00_01kLanes___01_4-members.html @@ -0,0 +1,91 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::Vectorize< Vector< int4_t, 8 >, kLanes_ > Member List
+
+
+ +

This is the complete list of members for cutlass::Vectorize< Vector< int4_t, 8 >, kLanes_ >, including all inherited members.

+ + +
Type typedefcutlass::Vectorize< Vector< int4_t, 8 >, kLanes_ >
+ + + + diff --git a/docs/structcutlass_1_1Vectorize_3_01Vector_3_01int4__t_00_018_01_4_00_01kLanes___01_4.html b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01int4__t_00_018_01_4_00_01kLanes___01_4.html new file mode 100644 index 0000000000..c5dc1793d3 --- /dev/null +++ b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01int4__t_00_018_01_4_00_01kLanes___01_4.html @@ -0,0 +1,118 @@ + + + + + + + +Cutlass: cutlass::Vectorize< Vector< int4_t, 8 >, kLanes_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::Vectorize< Vector< int4_t, 8 >, kLanes_ > Struct Template Reference
+
+
+ +

#include <vector.h>

+ + + + +

+Public Types

typedef Vector< int4_t, kLanes_ *8 > Type
 
+

Member Typedef Documentation

+ +

◆ Type

+ +
+
+
+template<int kLanes_>
+ + + + +
typedef Vector<int4_t, kLanes_ * 8> cutlass::Vectorize< Vector< int4_t, 8 >, kLanes_ >::Type
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1Vectorize_3_01Vector_3_01uint4__t_00_018_01_4_00_01kLanes___01_4-members.html b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01uint4__t_00_018_01_4_00_01kLanes___01_4-members.html new file mode 100644 index 0000000000..82b79ce57a --- /dev/null +++ b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01uint4__t_00_018_01_4_00_01kLanes___01_4-members.html @@ -0,0 +1,91 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::Vectorize< Vector< uint4_t, 8 >, kLanes_ > Member List
+
+
+ +

This is the complete list of members for cutlass::Vectorize< Vector< uint4_t, 8 >, kLanes_ >, including all inherited members.

+ + +
Type typedefcutlass::Vectorize< Vector< uint4_t, 8 >, kLanes_ >
+ + + + diff --git a/docs/structcutlass_1_1Vectorize_3_01Vector_3_01uint4__t_00_018_01_4_00_01kLanes___01_4.html b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01uint4__t_00_018_01_4_00_01kLanes___01_4.html new file mode 100644 index 0000000000..5122f26e3b --- /dev/null +++ b/docs/structcutlass_1_1Vectorize_3_01Vector_3_01uint4__t_00_018_01_4_00_01kLanes___01_4.html @@ -0,0 +1,118 @@ + + + + + + + +Cutlass: cutlass::Vectorize< Vector< uint4_t, 8 >, kLanes_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::Vectorize< Vector< uint4_t, 8 >, kLanes_ > Struct Template Reference
+
+
+ +

#include <vector.h>

+ + + + +

+Public Types

typedef Vector< uint4_t, kLanes_ *8 > Type
 
+

Member Typedef Documentation

+ +

◆ Type

+ +
+
+
+template<int kLanes_>
+ + + + +
typedef Vector<uint4_t, kLanes_ * 8> cutlass::Vectorize< Vector< uint4_t, 8 >, kLanes_ >::Type
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1ZipConvert-members.html b/docs/structcutlass_1_1ZipConvert-members.html new file mode 100644 index 0000000000..d4e9d72fb3 --- /dev/null +++ b/docs/structcutlass_1_1ZipConvert-members.html @@ -0,0 +1,99 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::ZipConvert< First_, Second_ > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1ZipConvert.html b/docs/structcutlass_1_1ZipConvert.html new file mode 100644 index 0000000000..f2189b54ac --- /dev/null +++ b/docs/structcutlass_1_1ZipConvert.html @@ -0,0 +1,340 @@ + + + + + + + +Cutlass: cutlass::ZipConvert< First_, Second_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::ZipConvert< First_, Second_ > Struct Template Reference
+
+
+ +

Zips two convert operations. +

+ +

#include <zip_fragment.h>

+ + + + + + + + + + + + + + +

+Public Types

typedef First_ First
 First convert operator. More...
 
typedef Second_ Second
 Second convert operator. More...
 
typedef ZipFragment< typename First::InputFragment, typename Second::InputFragment > InputFragment
 Defines the input zip fragment. More...
 
typedef ZipFragment< typename First::OutputFragment, typename Second::OutputFragment > OutputFragment
 Defines the output zip fragment. More...
 
+ + + + + + + + + + +

+Public Member Functions

CUTLASS_DEVICE ZipConvert ()
 Ctor. More...
 
CUTLASS_DEVICE ZipConvert (First const &_first, Second const &_second)
 Ctor. More...
 
CUTLASS_DEVICE void transform (InputFragment const &src, OutputFragment &dst)
 Transform a fragment. More...
 
+ + + + + + + +

+Public Attributes

First first
 First transformer. More...
 
Second second
 Second transformer. More...
 
+

Member Typedef Documentation

+ +

◆ First

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
typedef First_ cutlass::ZipConvert< First_, Second_ >::First
+
+ +
+
+ +

◆ InputFragment

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
typedef ZipFragment<typename First::InputFragment, typename Second::InputFragment> cutlass::ZipConvert< First_, Second_ >::InputFragment
+
+ +
+
+ +

◆ OutputFragment

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
typedef ZipFragment<typename First::OutputFragment, typename Second::OutputFragment> cutlass::ZipConvert< First_, Second_ >::OutputFragment
+
+ +
+
+ +

◆ Second

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
typedef Second_ cutlass::ZipConvert< First_, Second_ >::Second
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ ZipConvert() [1/2]

+ +
+
+
+template<typename First_, typename Second_>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE cutlass::ZipConvert< First_, Second_ >::ZipConvert ()
+
+inline
+
+ +
+
+ +

◆ ZipConvert() [2/2]

+ +
+
+
+template<typename First_, typename Second_>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE cutlass::ZipConvert< First_, Second_ >::ZipConvert (First const & _first,
Second const & _second 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ transform()

+ +
+
+
+template<typename First_, typename Second_>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::ZipConvert< First_, Second_ >::transform (InputFragment const & src,
OutputFragment & dst 
)
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ first

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
First cutlass::ZipConvert< First_, Second_ >::first
+
+ +
+
+ +

◆ second

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
Second cutlass::ZipConvert< First_, Second_ >::second
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1ZipFragment-members.html b/docs/structcutlass_1_1ZipFragment-members.html new file mode 100644 index 0000000000..51049f254a --- /dev/null +++ b/docs/structcutlass_1_1ZipFragment-members.html @@ -0,0 +1,98 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::ZipFragment< First_, Second_ > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1ZipFragment.html b/docs/structcutlass_1_1ZipFragment.html new file mode 100644 index 0000000000..8311f018bd --- /dev/null +++ b/docs/structcutlass_1_1ZipFragment.html @@ -0,0 +1,310 @@ + + + + + + + +Cutlass: cutlass::ZipFragment< First_, Second_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::ZipFragment< First_, Second_ > Struct Template Reference
+
+
+ +

A template defining Fragment Concept. +

+ +

#include <zip_fragment.h>

+ + + + + + + + + + + +

+Public Types

typedef First_ First
 First fragment object. More...
 
typedef Second_ Second
 Second fragment object. More...
 
typedef ZipFragment< First, Second > This_
 This class. More...
 
+ + + + + + + + + + +

+Public Member Functions

CUTLASS_DEVICE ZipFragment ()
 Default ctor. More...
 
CUTLASS_DEVICE ZipFragment (First const &_first, Second const &_second)
 Constructs a zip fragment from a pair of fragments. More...
 
CUTLASS_DEVICE void clear ()
 Clear a fragment. More...
 
+ + + + + + + +

+Public Attributes

First first
 First fragment object. More...
 
Second second
 Second fragment object. More...
 
+

Member Typedef Documentation

+ +

◆ First

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
typedef First_ cutlass::ZipFragment< First_, Second_ >::First
+
+ +
+
+ +

◆ Second

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
typedef Second_ cutlass::ZipFragment< First_, Second_ >::Second
+
+ +
+
+ +

◆ This_

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
typedef ZipFragment<First, Second> cutlass::ZipFragment< First_, Second_ >::This_
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ ZipFragment() [1/2]

+ +
+
+
+template<typename First_, typename Second_>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE cutlass::ZipFragment< First_, Second_ >::ZipFragment ()
+
+inline
+
+ +
+
+ +

◆ ZipFragment() [2/2]

+ +
+
+
+template<typename First_, typename Second_>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE cutlass::ZipFragment< First_, Second_ >::ZipFragment (First const & _first,
Second const & _second 
)
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ clear()

+ +
+
+
+template<typename First_, typename Second_>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE void cutlass::ZipFragment< First_, Second_ >::clear ()
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ first

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
First cutlass::ZipFragment< First_, Second_ >::first
+
+ +
+
+ +

◆ second

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
Second cutlass::ZipFragment< First_, Second_ >::second
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1ZipTensorRef-members.html b/docs/structcutlass_1_1ZipTensorRef-members.html new file mode 100644 index 0000000000..41d9fa2760 --- /dev/null +++ b/docs/structcutlass_1_1ZipTensorRef-members.html @@ -0,0 +1,96 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::ZipTensorRef< First_, Second_ > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1ZipTensorRef.html b/docs/structcutlass_1_1ZipTensorRef.html new file mode 100644 index 0000000000..e9943f2748 --- /dev/null +++ b/docs/structcutlass_1_1ZipTensorRef.html @@ -0,0 +1,255 @@ + + + + + + + +Cutlass: cutlass::ZipTensorRef< First_, Second_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::ZipTensorRef< First_, Second_ > Struct Template Reference
+
+
+ +

#include <zip_tensor_ref.h>

+ + + + + + + + +

+Public Types

typedef First_ First
 First tensor ref. More...
 
typedef Second_ Second
 Second tensor ref. More...
 
+ + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE ZipTensorRef ()
 
CUTLASS_HOST_DEVICE ZipTensorRef (First const &_first, Second const &_second)
 
+ + + + + + + +

+Public Attributes

First first
 First TensorRef. More...
 
Second second
 Second TensorRef. More...
 
+

Member Typedef Documentation

+ +

◆ First

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
typedef First_ cutlass::ZipTensorRef< First_, Second_ >::First
+
+ +
+
+ +

◆ Second

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
typedef Second_ cutlass::ZipTensorRef< First_, Second_ >::Second
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ ZipTensorRef() [1/2]

+ +
+
+
+template<typename First_, typename Second_>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::ZipTensorRef< First_, Second_ >::ZipTensorRef ()
+
+inline
+
+ +
+
+ +

◆ ZipTensorRef() [2/2]

+ +
+
+
+template<typename First_, typename Second_>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::ZipTensorRef< First_, Second_ >::ZipTensorRef (First const & _first,
Second const & _second 
)
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ first

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
First cutlass::ZipTensorRef< First_, Second_ >::first
+
+ +
+
+ +

◆ second

+ +
+
+
+template<typename First_, typename Second_>
+ + + + +
Second cutlass::ZipTensorRef< First_, Second_ >::second
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1ZipTileAllocation-members.html b/docs/structcutlass_1_1ZipTileAllocation-members.html new file mode 100644 index 0000000000..f7489bd008 --- /dev/null +++ b/docs/structcutlass_1_1ZipTileAllocation-members.html @@ -0,0 +1,98 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::ZipTileAllocation< First_, Second_ > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1ZipTileAllocation.html b/docs/structcutlass_1_1ZipTileAllocation.html new file mode 100644 index 0000000000..844ff780f6 --- /dev/null +++ b/docs/structcutlass_1_1ZipTileAllocation.html @@ -0,0 +1,287 @@ + + + + + + + +Cutlass: cutlass::ZipTileAllocation< First_, Second_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::ZipTileAllocation< First_, Second_ > Struct Template Reference
+
+
+ +

Manages a pair of tile allocations as if they are one allocation. +

+ +

#include <tile_allocation.h>

+ + + + + + + + + + + + + + +

+Public Types

typedef First_ First
 First tensor allocation. More...
 
typedef Second_ Second
 Second tensor allocation. More...
 
typedef ZipTensorRef< typename First::TensorRef, typename Second::TensorRef > TensorRef
 Defines the tensor reference for this allocation. More...
 
typedef ZipTensorRef< typename First::ConstTensorRef, typename Second::ConstTensorRef > ConstTensorRef
 Defines the constant tensor reference for this allocation. More...
 
+ + + + + + + +

+Public Member Functions

CUTLASS_DEVICE TensorRef reference ()
 Returns a TensorRef object pointing to the data. More...
 
CUTLASS_DEVICE ConstTensorRef reference () const
 Returns a ConstTensorRef object pointing to the data. More...
 
+ + + + + + + +

+Public Attributes

First first
 First tensor allocation. More...
 
Second second
 Second tensor allocation. More...
 
+

Member Typedef Documentation

+ +

◆ ConstTensorRef

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef ZipTensorRef<typename First::ConstTensorRef, typename Second::ConstTensorRef> cutlass::ZipTileAllocation< First_, Second_ >::ConstTensorRef
+
+ +
+
+ +

◆ First

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef First_ cutlass::ZipTileAllocation< First_, Second_ >::First
+
+ +
+
+ +

◆ Second

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef Second_ cutlass::ZipTileAllocation< First_, Second_ >::Second
+
+ +
+
+ +

◆ TensorRef

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
typedef ZipTensorRef<typename First::TensorRef, typename Second::TensorRef> cutlass::ZipTileAllocation< First_, Second_ >::TensorRef
+
+ +
+
+

Member Function Documentation

+ +

◆ reference() [1/2]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE TensorRef cutlass::ZipTileAllocation< First_, Second_ >::reference ()
+
+inline
+
+ +
+
+ +

◆ reference() [2/2]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE ConstTensorRef cutlass::ZipTileAllocation< First_, Second_ >::reference () const
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ first

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
First cutlass::ZipTileAllocation< First_, Second_ >::first
+
+ +
+
+ +

◆ second

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
Second cutlass::ZipTileAllocation< First_, Second_ >::second
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1ZipTileIterator_1_1Params-members.html b/docs/structcutlass_1_1ZipTileIterator_1_1Params-members.html new file mode 100644 index 0000000000..a197a4a8c8 --- /dev/null +++ b/docs/structcutlass_1_1ZipTileIterator_1_1Params-members.html @@ -0,0 +1,94 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::ZipTileIterator< First_, Second_ >::Params Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1ZipTileIterator_1_1Params.html b/docs/structcutlass_1_1ZipTileIterator_1_1Params.html new file mode 100644 index 0000000000..8034c42aad --- /dev/null +++ b/docs/structcutlass_1_1ZipTileIterator_1_1Params.html @@ -0,0 +1,217 @@ + + + + + + + +Cutlass: cutlass::ZipTileIterator< First_, Second_ >::Params Struct Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::ZipTileIterator< First_, Second_ >::Params Struct Reference
+
+
+ +

Params object. +

+ +

#include <zip_tile_iterator.h>

+ + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE Params ()
 Constructs a parameters object. More...
 
CUTLASS_HOST_DEVICE Params (typename First::Params const &_first, typename Second::Params const &_second)
 Constructs a parameters object. More...
 
+ + + + + + + +

+Public Attributes

First::Params first
 Parameters of first iterator. More...
 
Second::Params second
 Parameters of second iterator. More...
 
+

Constructor & Destructor Documentation

+ +

◆ Params() [1/2]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::ZipTileIterator< First_, Second_ >::Params::Params ()
+
+inline
+
+ +
+
+ +

◆ Params() [2/2]

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE cutlass::ZipTileIterator< First_, Second_ >::Params::Params (typename First::Params const & _first,
typename Second::Params const & _second 
)
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ first

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
First::Params cutlass::ZipTileIterator< First_, Second_ >::Params::first
+
+ +
+
+ +

◆ second

+ +
+
+
+template<typename First_ , typename Second_ >
+ + + + +
Second::Params cutlass::ZipTileIterator< First_, Second_ >::Params::second
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1bin1__t.html b/docs/structcutlass_1_1bin1__t.html new file mode 100644 index 0000000000..c3f4a98446 --- /dev/null +++ b/docs/structcutlass_1_1bin1__t.html @@ -0,0 +1,92 @@ + + + + + + + +Cutlass: cutlass::bin1_t Struct Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::bin1_t Struct Reference
+
+
+ +

#include <numeric_types.h>

+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1divide__assert-members.html b/docs/structcutlass_1_1divide__assert-members.html index 59e5af796f..8ddcbdf9d1 100644 --- a/docs/structcutlass_1_1divide__assert-members.html +++ b/docs/structcutlass_1_1divide__assert-members.html @@ -83,7 +83,7 @@

diff --git a/docs/structcutlass_1_1divide__assert.html b/docs/structcutlass_1_1divide__assert.html index f7dd669007..e7741e4712 100644 --- a/docs/structcutlass_1_1divide__assert.html +++ b/docs/structcutlass_1_1divide__assert.html @@ -119,7 +119,7 @@

diff --git a/docs/structcutlass_1_1gemm_1_1ClearAccumulators-members.html b/docs/structcutlass_1_1gemm_1_1ClearAccumulators-members.html index c3f2e3e929..647a7ed989 100644 --- a/docs/structcutlass_1_1gemm_1_1ClearAccumulators-members.html +++ b/docs/structcutlass_1_1gemm_1_1ClearAccumulators-members.html @@ -81,10 +81,11 @@ +
clear(Fragment_ &fragment)cutlass::gemm::ClearAccumulators< Scalar_, kLanes_ >inline
ClearAccumulators(SharedStorage &shared_storage)cutlass::gemm::ClearAccumulators< Scalar_, kLanes_ >inline
ClearAccumulators()cutlass::gemm::ClearAccumulators< Scalar_, kLanes_ >inline

diff --git a/docs/structcutlass_1_1gemm_1_1ClearAccumulators.html b/docs/structcutlass_1_1gemm_1_1ClearAccumulators.html index e815e57d6f..314cffc664 100644 --- a/docs/structcutlass_1_1gemm_1_1ClearAccumulators.html +++ b/docs/structcutlass_1_1gemm_1_1ClearAccumulators.html @@ -94,6 +94,9 @@
CUTLASS_DEVICE ClearAccumulators (SharedStorage &shared_storage)
 Ctor. More...
 
CUTLASS_DEVICE ClearAccumulators ()
 Ctor. More...
 
template<typename Fragment_ >
CUTLASS_DEVICE void clear (Fragment_ &fragment)
 Clear the fragment. More...

Constructor & Destructor Documentation

-

◆ ClearAccumulators()

+

◆ ClearAccumulators() [1/2]

@@ -126,6 +129,33 @@

+

+
+ +

◆ ClearAccumulators() [2/2]

+ +
+
+
+template<typename Scalar_ , int kLanes_ = 1>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE cutlass::gemm::ClearAccumulators< Scalar_, kLanes_ >::ClearAccumulators ()
+
+inline
+
+

Member Function Documentation

@@ -165,7 +195,7 @@

diff --git a/docs/structcutlass_1_1gemm_1_1ClearAccumulators_1_1SharedStorage.html b/docs/structcutlass_1_1gemm_1_1ClearAccumulators_1_1SharedStorage.html index b97be88f35..c30f552209 100644 --- a/docs/structcutlass_1_1gemm_1_1ClearAccumulators_1_1SharedStorage.html +++ b/docs/structcutlass_1_1gemm_1_1ClearAccumulators_1_1SharedStorage.html @@ -87,7 +87,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1ColumnMajorBlockSwizzle-members.html b/docs/structcutlass_1_1gemm_1_1ColumnMajorBlockSwizzle-members.html new file mode 100644 index 0000000000..698bd4d6c3 --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1ColumnMajorBlockSwizzle-members.html @@ -0,0 +1,95 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::gemm::ColumnMajorBlockSwizzle< groupCols, swDirection > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1gemm_1_1ColumnMajorBlockSwizzle.html b/docs/structcutlass_1_1gemm_1_1ColumnMajorBlockSwizzle.html new file mode 100644 index 0000000000..19c0d5e2e8 --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1ColumnMajorBlockSwizzle.html @@ -0,0 +1,260 @@ + + + + + + + +Cutlass: cutlass::gemm::ColumnMajorBlockSwizzle< groupCols, swDirection > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::gemm::ColumnMajorBlockSwizzle< groupCols, swDirection > Struct Template Reference
+
+
+ +

#include <threadblock_swizzle.h>

+ + + + + + + + + + + + + + +

+Public Member Functions

CUTLASS_HOST_DEVICE ColumnMajorBlockSwizzle ()
 Ctor. More...
 
CUTLASS_DEVICE dim3 swizzle ()
 Swizzle the block index. More...
 
CUTLASS_HOST_DEVICE dim3 get_grid_layout (GemmCoord const &problem_size, Coord< 3 > const &OutputTile)
 
CUTLASS_DEVICE Coord< 3 > get_threadblock_offset (Coord< 3 > const &OutputTile)
 
CUTLASS_DEVICE int get_batch_id ()
 
+

Constructor & Destructor Documentation

+ +

◆ ColumnMajorBlockSwizzle()

+ +
+
+
+template<int groupCols, enum swizzleDirection::Kind swDirection>
+ + + + + +
+ + + + + + + +
CUTLASS_HOST_DEVICE cutlass::gemm::ColumnMajorBlockSwizzle< groupCols, swDirection >::ColumnMajorBlockSwizzle ()
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ get_batch_id()

+ +
+
+
+template<int groupCols, enum swizzleDirection::Kind swDirection>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE int cutlass::gemm::ColumnMajorBlockSwizzle< groupCols, swDirection >::get_batch_id ()
+
+inline
+
+ +
+
+ +

◆ get_grid_layout()

+ +
+
+
+template<int groupCols, enum swizzleDirection::Kind swDirection>
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
CUTLASS_HOST_DEVICE dim3 cutlass::gemm::ColumnMajorBlockSwizzle< groupCols, swDirection >::get_grid_layout (GemmCoord const & problem_size,
Coord< 3 > const & OutputTile 
)
+
+inline
+
+ +
+
+ +

◆ get_threadblock_offset()

+ +
+
+
+template<int groupCols, enum swizzleDirection::Kind swDirection>
+ + + + + +
+ + + + + + + + +
CUTLASS_DEVICE Coord<3> cutlass::gemm::ColumnMajorBlockSwizzle< groupCols, swDirection >::get_threadblock_offset (Coord< 3 > const & OutputTile)
+
+inline
+
+ +
+
+ +

◆ swizzle()

+ +
+
+
+template<int groupCols, enum swizzleDirection::Kind swDirection>
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE dim3 cutlass::gemm::ColumnMajorBlockSwizzle< groupCols, swDirection >::swizzle ()
+
+inline
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1gemm_1_1DgemmConfig-members.html b/docs/structcutlass_1_1gemm_1_1DgemmConfig-members.html index 256b383d18..74a8b93953 100644 --- a/docs/structcutlass_1_1gemm_1_1DgemmConfig-members.html +++ b/docs/structcutlass_1_1gemm_1_1DgemmConfig-members.html @@ -73,41 +73,44 @@
-
cutlass::gemm::DgemmConfig< OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Member List
+
cutlass::gemm::DgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Member List
-

This is the complete list of members for cutlass::gemm::DgemmConfig< OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_ >, including all inherited members.

+

This is the complete list of members for cutlass::gemm::DgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ >, including all inherited members.

- - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Accumulators typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
AccumulatorsPerWarp typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
InstructionShape typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
kAccumulatorsPerLdsAcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kAccumulatorsPerLdsBcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerLdgAcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerLdgBcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerLdgCcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerLdsAcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerLdsBcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerLdsDcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerStgDcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerStsAcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerStsBcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kScalarsPerStsDcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kStagescutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kThreadscutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
kWarpSizecutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >static
MultiplyAdd typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
OutputTile typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
ScalarA typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
ScalarB typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
ScalarC typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
ScalarD typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
Warps typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
Accumulators typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
AccumulatorsPerWarp typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
InstructionShape typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
kAccumulatorsPerLdsAcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kAccumulatorsPerLdsBcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kLaunchBoundscutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kResidueInPrologcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kResidueSeparatecutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerLdgAcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerLdgBcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerLdgCcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerLdsAcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerLdsBcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerLdsDcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerStgDcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerStsAcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerStsBcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kScalarsPerStsDcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kStagescutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kThreadscutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
kWarpSizecutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >static
MultiplyAdd typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
OutputTile typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
ScalarA typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
ScalarB typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
ScalarC typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
ScalarD typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
Warps typedefcutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
diff --git a/docs/structcutlass_1_1gemm_1_1DgemmConfig.html b/docs/structcutlass_1_1gemm_1_1DgemmConfig.html index 7ac0411289..cb727aca5e 100644 --- a/docs/structcutlass_1_1gemm_1_1DgemmConfig.html +++ b/docs/structcutlass_1_1gemm_1_1DgemmConfig.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::DgemmConfig< OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Struct Template Reference +Cutlass: cutlass::gemm::DgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Struct Template Reference @@ -75,93 +75,102 @@
-
cutlass::gemm::DgemmConfig< OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Struct Template Reference
+
cutlass::gemm::DgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Struct Template Reference

#include <dgemm_traits.h>

-Inheritance diagram for cutlass::gemm::DgemmConfig< OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_ >:
+Inheritance diagram for cutlass::gemm::DgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ >:
- - -cutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 > + + +cutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Additional Inherited Members

- Public Types inherited from cutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
typedef double ScalarA
 The scalar for A. More...
 
typedef double ScalarB
 The scalar for B. More...
 
typedef double ScalarC
 The scalar for C. More...
 
typedef double ScalarD
 The scalar for D. More...
 
typedef OutputTile_ OutputTile
 The tile. More...
 
typedef ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double > MultiplyAdd
 The functor to do D = A*B + C. More...
 
typedef MultiplyAdd::InstructionShape InstructionShape
 The shape of the instruction. More...
 
typedef MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
 The number of accumulators per warp. More...
 
typedef MultiplyAdd::Accumulators Accumulators
 The accumulators. More...
 
typedef ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
 The number of warps. More...
 
- Static Public Attributes inherited from cutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2 >
static int const kWarpSize
 The default warp size (32 threads per warp). More...
 
static int const kThreads
 The number of threads. More...
 
static int const kScalarsPerLdgA
 The number of scalars per LDG/STS/LDS for A. More...
 
static int const kScalarsPerStsA
 
static int const kScalarsPerLdsA
 
static int const kScalarsPerLdgB
 The number of scalars per LDG/STS/LDS for B. More...
 
static int const kScalarsPerStsB
 
static int const kScalarsPerLdsB
 
static int const kScalarsPerLdgC
 The number of scalars per LDG for C. More...
 
static int const kScalarsPerStgD
 The number of scalars per STS/LDS/STG for D. More...
 
static int const kScalarsPerStsD
 
static int const kScalarsPerLdsD
 
static int const kAccumulatorsPerLdsA
 The number of accumulators that are going to be fed from one LDS A/B. More...
 
static int const kAccumulatorsPerLdsB
 
static int const kStages
 The number of stages in shared memory to implement double, triple, more-buffering. More...
 
- Public Types inherited from cutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
typedef double ScalarA
 The scalar for A. More...
 
typedef double ScalarB
 The scalar for B. More...
 
typedef double ScalarC
 The scalar for C. More...
 
typedef double ScalarD
 The scalar for D. More...
 
typedef OutputTile_ OutputTile
 The tile. More...
 
typedef ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double > MultiplyAdd
 The functor to do D = A*B + C. More...
 
typedef MultiplyAdd::InstructionShape InstructionShape
 The shape of the instruction. More...
 
typedef MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
 The shape of warp-level GEMM. More...
 
typedef MultiplyAdd::Accumulators Accumulators
 The accumulators. More...
 
typedef ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
 The number of warps. More...
 
- Static Public Attributes inherited from cutlass::gemm::GemmConfig< double, double, double, double, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, double, double, double >, kScalarsPerLdgA_, kScalarsPerLdgA_, 2, kScalarsPerLdgB_, kScalarsPerLdgB_, 2, 1, 2, 1, 2, false, false, false >
static int const kWarpSize
 The default warp size (32 threads per warp). More...
 
static int const kThreads
 The number of threads. More...
 
static int const kScalarsPerLdgA
 The number of scalars per LDG/STS/LDS for A. More...
 
static int const kScalarsPerStsA
 
static int const kScalarsPerLdsA
 
static int const kScalarsPerLdgB
 The number of scalars per LDG/STS/LDS for B. More...
 
static int const kScalarsPerStsB
 
static int const kScalarsPerLdsB
 
static int const kScalarsPerLdgC
 The number of scalars per LDG for C. More...
 
static int const kScalarsPerStgD
 The number of scalars per STS/LDS/STG for D. More...
 
static int const kScalarsPerStsD
 
static int const kScalarsPerLdsD
 
static int const kAccumulatorsPerLdsA
 The number of accumulators that are going to be fed from one LDS A/B. More...
 
static int const kAccumulatorsPerLdsB
 
static int const kStages
 The number of stages in shared memory to implement double, triple, more-buffering. More...
 
static bool const kResidueSeparate
 If true, mainloop is instantiated twice. The first instantiation contains no predicate. More...
 
static bool const kResidueInProlog
 If true, residue is computed in the prologue. More...
 
static bool const kLaunchBounds
 If true, kernel is launched with launch bounds specified. More...
 

The documentation for this struct was generated from the following file:

Additional Inherited Members

- Public Types inherited from cutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
typedef GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > > This_
 This traits. More...
 
typedef cutlass::gemm::Gemm< This_ > KernelClass
 The struct that consumes this Traits. More...
 
typedef GemmConfig_ GemmConfig
 The configuration. More...
 
typedef GemmConfig::OutputTile OutputTile
 The output tile. More...
 
typedef SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA GlobalLoadStreamA
typedef SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB SharedLoadStreamB
 The iterator for B to load from shared memory. More...
 
typedef GlobalLoadStreamA::SharedStoreStorage SharedStoreStorageA
 The shared storage for A. More...
 
typedef GlobalLoadStreamB::SharedStoreStorage SharedStoreStorageB
 The shared storage for B. More...
 
typedef GemmConfig::MultiplyAdd MultiplyAdd
 The multiply-add functor. More...
 
typedef GemmEpilogue< GemmEpilogueTraits_ > Epilogue
typedef ClearAccumulators< GemmConfig_::Accumulators::Element > ClearAccumulators
 Clear the accumulators. More...
 
typedef GlobalLoadStreamPair< GlobalLoadStreamA, GlobalLoadStreamB, GemmConfig::kResidueInProlog > GlobalLoadStream
 Assemble the global load streams for A/B. More...
 
typedef GlobalLoadStream::ThreadblockTileStorage ThreadblockTileStorage
 Memory needed to store the threadblock-scoped GEMM tile. More...
 
typedef SharedStreamPair< SharedLoadStreamA, SharedLoadStreamB > SharedStream
 Assemble the shared load streams for A/B. More...
 
- Static Public Member Functions inherited from cutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
static CUTLASS_DEVICE void shared_load_fence (bool in_loop)
 The memory fence for shared loads. More...
+ + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+ + + + + + + + + +
+
+ + +
+ +
+ + + +
+
+
cutlass::gemm::Fp16SgemmConfig< OutputTile_, ThreadGemmShape_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Member List
+
+
+ +

This is the complete list of members for cutlass::gemm::Fp16SgemmConfig< OutputTile_, ThreadGemmShape_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, kScalarsPerLdgA_, kScalarsPerLdgB_ >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Accumulators typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
AccumulatorsPerWarp typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
InstructionShape typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
kAccumulatorsPerLdsAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kAccumulatorsPerLdsBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kLaunchBoundscutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kResidueInPrologcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kResidueSeparatecutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerLdgAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerLdgBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerLdgCcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerLdsAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerLdsBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerLdsDcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerStgDcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerStsAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerStsBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kScalarsPerStsDcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kStagescutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kThreadscutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
kWarpSizecutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >static
MultiplyAdd typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
OutputTile typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
ScalarA typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
ScalarB typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
ScalarC typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
ScalarD typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
Warps typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
+ + + + diff --git a/docs/structcutlass_1_1gemm_1_1Fp16SgemmConfig.html b/docs/structcutlass_1_1gemm_1_1Fp16SgemmConfig.html new file mode 100644 index 0000000000..b377b2896f --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1Fp16SgemmConfig.html @@ -0,0 +1,186 @@ + + + + + + + +Cutlass: cutlass::gemm::Fp16SgemmConfig< OutputTile_, ThreadGemmShape_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::gemm::Fp16SgemmConfig< OutputTile_, ThreadGemmShape_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, kScalarsPerLdgA_, kScalarsPerLdgB_ > Struct Template Reference
+
+
+ +

#include <fp16_sgemm_traits.h>

+
+Inheritance diagram for cutlass::gemm::Fp16SgemmConfig< OutputTile_, ThreadGemmShape_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, kScalarsPerLdgA_, kScalarsPerLdgB_ >:
+
+
+ + +cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 > + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Additional Inherited Members

- Public Types inherited from cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
typedef ScalarA_ ScalarA
 The scalar for A. More...
 
typedef ScalarB_ ScalarB
 The scalar for B. More...
 
typedef ScalarC_ ScalarC
 The scalar for C. More...
 
typedef ScalarD_ ScalarD
 The scalar for D. More...
 
typedef OutputTile_ OutputTile
 The tile. More...
 
typedef ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float > MultiplyAdd
 The functor to do D = A*B + C. More...
 
typedef MultiplyAdd::InstructionShape InstructionShape
 The shape of the instruction. More...
 
typedef MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
 The shape of warp-level GEMM. More...
 
typedef MultiplyAdd::Accumulators Accumulators
 The accumulators. More...
 
typedef ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
 The number of warps. More...
 
- Static Public Attributes inherited from cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, ScalarA_, ScalarB_, float >, kScalarsPerLdgA_, kScalarsPerLdgA_, 4, kScalarsPerLdgB_, kScalarsPerLdgB_, 4, 1, 4, 1, 2 >
static int const kWarpSize
 The default warp size (32 threads per warp). More...
 
static int const kThreads
 The number of threads. More...
 
static int const kScalarsPerLdgA
 The number of scalars per LDG/STS/LDS for A. More...
 
static int const kScalarsPerStsA
 
static int const kScalarsPerLdsA
 
static int const kScalarsPerLdgB
 The number of scalars per LDG/STS/LDS for B. More...
 
static int const kScalarsPerStsB
 
static int const kScalarsPerLdsB
 
static int const kScalarsPerLdgC
 The number of scalars per LDG for C. More...
 
static int const kScalarsPerStgD
 The number of scalars per STS/LDS/STG for D. More...
 
static int const kScalarsPerStsD
 
static int const kScalarsPerLdsD
 
static int const kAccumulatorsPerLdsA
 The number of accumulators that are going to be fed from one LDS A/B. More...
 
static int const kAccumulatorsPerLdsB
 
static int const kStages
 The number of stages in shared memory to implement double, triple, more-buffering. More...
 
static bool const kResidueSeparate
 If true, mainloop is instantiated twice. The first instantiation contains no predicate. More...
 
static bool const kResidueInProlog
 If true, residue is computed in the prologue. More...
 
static bool const kLaunchBounds
 If true, kernel is launched with launch bounds specified. More...
 
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1gemm_1_1Fp16SgemmConfig.png b/docs/structcutlass_1_1gemm_1_1Fp16SgemmConfig.png new file mode 100644 index 0000000000..b06220083c Binary files /dev/null and b/docs/structcutlass_1_1gemm_1_1Fp16SgemmConfig.png differ diff --git a/docs/structcutlass_1_1gemm_1_1Fp16SgemmSgemmTraits-members.html b/docs/structcutlass_1_1gemm_1_1Fp16SgemmSgemmTraits-members.html new file mode 100644 index 0000000000..b133a1dc10 --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1Fp16SgemmSgemmTraits-members.html @@ -0,0 +1,114 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+
+
cutlass::gemm::Fp16SgemmSgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, Scalar_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, GemmConfig_, GemmEpilogueTraits_ > Member List
+
+
+ +

This is the complete list of members for cutlass::gemm::Fp16SgemmSgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, Scalar_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, GemmConfig_, GemmEpilogueTraits_ >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + + + + + + + +
BlockSwizzle typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
ClearAccumulators typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
Epilogue typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
GemmConfig typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
GlobalLoadStream typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
GlobalLoadStreamA typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
GlobalLoadStreamB typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
Index typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
KernelClass typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
kLayoutAcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >static
kLayoutBcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >static
MultiplyAdd typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
OutputTile typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
ScalarA typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
ScalarB typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
ScalarC typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
ScalarD typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
shared_load_fence(bool in_loop)cutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >inlinestatic
shared_store_fence(bool in_loop)cutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >inlinestatic
SharedLoadStreamA typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
SharedLoadStreamB typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
SharedStream typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
This_ typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
ThreadblockTileStorage typedefcutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
+ + + + diff --git a/docs/structcutlass_1_1gemm_1_1Fp16SgemmSgemmTraits.html b/docs/structcutlass_1_1gemm_1_1Fp16SgemmSgemmTraits.html new file mode 100644 index 0000000000..1d06a26c89 --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1Fp16SgemmSgemmTraits.html @@ -0,0 +1,182 @@ + + + + + + + +Cutlass: cutlass::gemm::Fp16SgemmSgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, Scalar_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, GemmConfig_, GemmEpilogueTraits_ > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::gemm::Fp16SgemmSgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, Scalar_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, GemmConfig_, GemmEpilogueTraits_ > Struct Template Reference
+
+
+ +

#include <fp16_sgemm_traits.h>

+
+Inheritance diagram for cutlass::gemm::Fp16SgemmSgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, ScalarA_, ScalarB_, ScalarC_, ScalarD_, Scalar_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, GemmConfig_, GemmEpilogueTraits_ >:
+
+
+ + +cutlass::gemm::SimplifiedGemmTraits< kLayoutA_, kLayoutB_, GemmConfig_, GemmEpilogue< GemmEpilogueTraits_ >, Index_ > +cutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > > + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

+Additional Inherited Members

- Public Types inherited from cutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
typedef GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > > This_
 This traits. More...
 
typedef cutlass::gemm::Gemm< This_KernelClass
 The struct that consumes this Traits. More...
 
typedef GemmConfig_ GemmConfig
 The configuration. More...
 
typedef GemmConfig::OutputTile OutputTile
 The output tile. More...
 
typedef SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA GlobalLoadStreamA
 The stream to load A from global memory to shared memory. More...
 
typedef SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA ::Scalar ScalarA
 The scalar for A. More...
 
typedef SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB GlobalLoadStreamB
 The stream to load B from global memory to shared memory. More...
 
typedef SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB ::Scalar ScalarB
 The scalar for B. More...
 
typedef SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA SharedLoadStreamA
 The iterator for A to load from shared memory. More...
 
typedef SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB SharedLoadStreamB
 The iterator for B to load from shared memory. More...
 
typedef GemmConfig::MultiplyAdd MultiplyAdd
 The multiply-add functor. More...
 
typedef GemmEpilogue< GemmEpilogueTraits_ > Epilogue
 The epilogue. More...
 
typedef Epilogue::ScalarC ScalarC
 The scalars in the epilogue. More...
 
typedef Epilogue::ScalarD ScalarD
 
typedef IdentityBlockSwizzle BlockSwizzle
 The block swizzle to reorganize the grid. More...
 
typedef Index_ Index
 The index. More...
 
typedef ClearAccumulators< GemmConfig_::Accumulators::Element > ClearAccumulators
 Clear the accumulators. More...
 
typedef GlobalLoadStreamPair< GlobalLoadStreamA, GlobalLoadStreamB, GemmConfig::kResidueInProlog > GlobalLoadStream
 Assemble the global load streams for A/B. More...
 
typedef GlobalLoadStream::ThreadblockTileStorage ThreadblockTileStorage
 Memory needed to store the threadblock-scoped GEMM tile. More...
 
typedef SharedStreamPair< SharedLoadStreamA, SharedLoadStreamBSharedStream
 Assemble the shared load streams for A/B. More...
 
- Static Public Member Functions inherited from cutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
static CUTLASS_DEVICE void shared_load_fence (bool in_loop)
 The memory fence for shared loads. More...
 
static CUTLASS_DEVICE void shared_store_fence (bool in_loop)
 The memory fence for shared stores. More...
 
- Static Public Attributes inherited from cutlass::gemm::GemmTraits< GemmConfig_, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::GlobalLoadStreamB, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamA, SimplifiedGemmTraitsHelper< GemmTileTraitsHelperA< kLayoutA_, GemmConfig_ >, GemmTileTraitsHelperB< kLayoutB_, GemmConfig_ >, Index_ > ::SharedLoadStreamB, GemmEpilogue< GemmEpilogueTraits_ >, IdentityBlockSwizzle, Index_, ClearAccumulators< GemmConfig_::Accumulators::Element > >
static MatrixLayout::Kind const kLayoutA
 The layout of A. More...
 
static MatrixLayout::Kind const kLayoutB
 The layout of B. More...
 
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1gemm_1_1Fp16SgemmSgemmTraits.png b/docs/structcutlass_1_1gemm_1_1Fp16SgemmSgemmTraits.png new file mode 100644 index 0000000000..c5dd66deab Binary files /dev/null and b/docs/structcutlass_1_1gemm_1_1Fp16SgemmSgemmTraits.png differ diff --git a/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd-members.html b/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd-members.html index f03e26ac28..d7c382ac07 100644 --- a/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd-members.html +++ b/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd-members.html @@ -73,23 +73,22 @@
-
cutlass::gemm::FragmentMultiplyAdd< Scalar_ > Member List
+
cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 > Member List
-

This is the complete list of members for cutlass::gemm::FragmentMultiplyAdd< Scalar_ >, including all inherited members.

+

This is the complete list of members for cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >, including all inherited members.

- - - - - - - + + + + + +
FragmentMultiplyAdd()cutlass::gemm::FragmentMultiplyAdd< Scalar_ >inline
InstructionShape typedefcutlass::gemm::FragmentMultiplyAdd< Scalar_ >
multiply(Scalar_ a, Fragment_ const &b, Fragment_ &d)cutlass::gemm::FragmentMultiplyAdd< Scalar_ >inline
multiply_add(Scalar_ a, Fragment_ const &b, Fragment_ const &c, Fragment_ &d)cutlass::gemm::FragmentMultiplyAdd< Scalar_ >inline
ScalarA typedefcutlass::gemm::FragmentMultiplyAdd< Scalar_ >
ScalarB typedefcutlass::gemm::FragmentMultiplyAdd< Scalar_ >
ScalarC typedefcutlass::gemm::FragmentMultiplyAdd< Scalar_ >
FragmentMultiplyAdd()cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >inline
InstructionShape typedefcutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >
multiply(ScalarAlphaBeta a, FragmentB_ const &b, FragmentCd_ &d)cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >inline
multiply_add(ScalarAlphaBeta a, FragmentB_ const &b, FragmentCd_ const &c, FragmentCd_ &d)cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >inline
ScalarAccum typedefcutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >
ScalarAlphaBeta typedefcutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >
diff --git a/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd.html b/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd.html index bde87a6e3c..29bd3c0746 100644 --- a/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd.html +++ b/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::FragmentMultiplyAdd< Scalar_ > Struct Template Reference +Cutlass: cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 > Struct Template Reference @@ -77,7 +77,7 @@ Public Member Functions | List of all members
-
cutlass::gemm::FragmentMultiplyAdd< Scalar_ > Struct Template Reference
+
cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 > Struct Template Reference
@@ -85,92 +85,73 @@ - - - - - - - - - - - - + + + + + + + + +

Public Types

typedef Shape< 1, 1, 1, 1 > InstructionShape
 The shape of the instruction. More...
 
typedef Scalar_ ScalarA
 The type for A. More...
 
typedef Scalar_ ScalarB
 The type for B. More...
 
typedef Scalar_ ScalarC
 The type for C and D. More...
 
typedef Shape< 1, 1, 1, 1 > InstructionShape
 The shape of the instruction. More...
 
typedef ScalarAlphaBeta_ ScalarAlphaBeta
 The type for alpha and beta. More...
 
typedef ScalarAccum_ ScalarAccum
 The type for accumlator. More...
 
- - - - - - - - - - - + + + + + + + + + + +

Public Member Functions

CUTLASS_DEVICE FragmentMultiplyAdd ()
 Ctor. More...
 
template<typename Fragment_ >
CUTLASS_DEVICE void multiply (Scalar_ a, Fragment_ const &b, Fragment_ &d)
 Multiply : d = a*b. More...
 
template<typename Fragment_ >
CUTLASS_DEVICE void multiply_add (Scalar_ a, Fragment_ const &b, Fragment_ const &c, Fragment_ &d)
 Multiply : d = a*b + c. More...
 
CUTLASS_DEVICE FragmentMultiplyAdd ()
 Ctor. More...
 
template<typename FragmentB_ , typename FragmentCd_ >
CUTLASS_DEVICE void multiply (ScalarAlphaBeta a, FragmentB_ const &b, FragmentCd_ &d)
 Multiply : d = a*b. More...
 
template<typename FragmentB_ , typename FragmentCd_ >
CUTLASS_DEVICE void multiply_add (ScalarAlphaBeta a, FragmentB_ const &b, FragmentCd_ const &c, FragmentCd_ &d)
 Multiply : d = a*b + c. More...
 

Member Typedef Documentation

- -

◆ InstructionShape

+ +

◆ InstructionShape

-template<typename Scalar_ >
+template<typename ScalarAlphaBeta_ , typename ScalarAccum_ , bool fragMul2 = true>
- +
typedef Shape<1, 1, 1, 1> cutlass::gemm::FragmentMultiplyAdd< Scalar_ >::InstructionShapetypedef Shape<1, 1, 1, 1> cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >::InstructionShape
- -

◆ ScalarA

+ +

◆ ScalarAccum

-template<typename Scalar_ >
+template<typename ScalarAlphaBeta_ , typename ScalarAccum_ , bool fragMul2 = true>
- +
typedef Scalar_ cutlass::gemm::FragmentMultiplyAdd< Scalar_ >::ScalarAtypedef ScalarAccum_ cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >::ScalarAccum
- -

◆ ScalarB

+ +

◆ ScalarAlphaBeta

-template<typename Scalar_ >
+template<typename ScalarAlphaBeta_ , typename ScalarAccum_ , bool fragMul2 = true>
- - -
typedef Scalar_ cutlass::gemm::FragmentMultiplyAdd< Scalar_ >::ScalarB
-
- -
- - -

◆ ScalarC

- -
-
-
-template<typename Scalar_ >
- - - +
typedef Scalar_ cutlass::gemm::FragmentMultiplyAdd< Scalar_ >::ScalarCtypedef ScalarAlphaBeta_ cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >::ScalarAlphaBeta
@@ -178,19 +159,19 @@

Constructor & Destructor Documentation

- -

◆ FragmentMultiplyAdd()

+ +

◆ FragmentMultiplyAdd()

-template<typename Scalar_ >
+template<typename ScalarAlphaBeta_ , typename ScalarAccum_ , bool fragMul2 = true>
- +
- + @@ -206,35 +187,35 @@

Member Function Documentation

- -

◆ multiply()

+ +

◆ multiply()

-template<typename Scalar_ >
+template<typename ScalarAlphaBeta_ , typename ScalarAccum_ , bool fragMul2 = true>
-template<typename Fragment_ >
+template<typename FragmentB_ , typename FragmentCd_ >
CUTLASS_DEVICE cutlass::gemm::FragmentMultiplyAdd< Scalar_ >::FragmentMultiplyAdd CUTLASS_DEVICE cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >::FragmentMultiplyAdd ( )
- + @@ -376,6 +465,7 @@

+

Launch the kernel.

@@ -399,7 +489,7 @@

- + @@ -447,7 +537,6 @@

-

Define the mainloop iteration size

@@ -474,6 +563,30 @@

+ + + +

◆ kWarpGemmSteps

+ +
+
+
+template<typename GemmTraits_ >
+

- + - + - + - + @@ -252,41 +233,41 @@

-

◆ multiply_add()

+ +

◆ multiply_add()

-template<typename Scalar_ >
+template<typename ScalarAlphaBeta_ , typename ScalarAccum_ , bool fragMul2 = true>
-template<typename Fragment_ >
+template<typename FragmentB_ , typename FragmentCd_ >

CUTLASS_DEVICE void cutlass::gemm::FragmentMultiplyAdd< Scalar_ >::multiply CUTLASS_DEVICE void cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >::multiply (Scalar_ ScalarAlphaBeta  a,
Fragment_ const & FragmentB_ const &  b,
Fragment_ & FragmentCd_ &  d 
- + @@ -341,6 +378,58 @@

Member Function Documentation

+ +

◆ consume_tile()

+ +
+
+
+template<typename GemmTraits_ >
+
+template<bool Residue, bool LastIteration>
+
- + - + - + - + - + @@ -310,7 +291,7 @@

diff --git a/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd_3_01half_00_01half_00_01true_01_4-members.html b/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd_3_01half_00_01half_00_01true_01_4-members.html new file mode 100644 index 0000000000..9730de1f66 --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd_3_01half_00_01half_00_01true_01_4-members.html @@ -0,0 +1,96 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
+
+

CUTLASS_DEVICE void cutlass::gemm::FragmentMultiplyAdd< Scalar_ >::multiply_add CUTLASS_DEVICE void cutlass::gemm::FragmentMultiplyAdd< ScalarAlphaBeta_, ScalarAccum_, fragMul2 >::multiply_add (Scalar_ ScalarAlphaBeta  a,
Fragment_ const & FragmentB_ const &  b,
Fragment_ const & FragmentCd_ const &  c,
Fragment_ & FragmentCd_ &  d 
+ + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+ + + + + + + + + +
+
+ + +
+ +
+ + + +
+
+
cutlass::gemm::FragmentMultiplyAdd< half, half, true > Member List
+
+ + + + + diff --git a/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd_3_01half_00_01half_00_01true_01_4.html b/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd_3_01half_00_01half_00_01true_01_4.html new file mode 100644 index 0000000000..85a462d29f --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1FragmentMultiplyAdd_3_01half_00_01half_00_01true_01_4.html @@ -0,0 +1,287 @@ + + + + + + + +Cutlass: cutlass::gemm::FragmentMultiplyAdd< half, half, true > Struct Template Reference + + + + + + + + + + +
+
+ + + + + + +
+
Cutlass +
+
CUDA Templates for Linear Algebra Subroutines and Solvers
+
+
+ + + + + + + + +
+
+ + +
+ +
+ + +
+
+ +
+
cutlass::gemm::FragmentMultiplyAdd< half, half, true > Struct Template Reference
+
+
+ +

#include <fragment_multiply_add.h>

+ + + + + + + + + + + +

+Public Types

typedef Shape< 1, 1, 1, 1 > InstructionShape
 The shape of the instruction. More...
 
typedef half ScalarAlphaBeta
 The type for alpha and beta. More...
 
typedef half ScalarAccum
 The type for accumlator. More...
 
+ + + + + + + + + + + + +

+Public Member Functions

CUTLASS_DEVICE FragmentMultiplyAdd ()
 Ctor. More...
 
template<typename FragmentB_ , typename FragmentCd_ >
CUTLASS_DEVICE void multiply (half a, FragmentB_ const &b, FragmentCd_ &d)
 Multiply : d = a*b. More...
 
template<typename FragmentB_ , typename FragmentCd_ >
CUTLASS_DEVICE void multiply_add (half a, FragmentB_ const &b, FragmentCd_ const &c, FragmentCd_ &d)
 Multiply : d = a*b + c. More...
 
+

Member Typedef Documentation

+ +

◆ InstructionShape

+ +
+
+ + + + +
typedef Shape<1, 1, 1, 1> cutlass::gemm::FragmentMultiplyAdd< half, half, true >::InstructionShape
+
+ +
+
+ +

◆ ScalarAccum

+ +
+
+ + + + +
typedef half cutlass::gemm::FragmentMultiplyAdd< half, half, true >::ScalarAccum
+
+ +
+
+ +

◆ ScalarAlphaBeta

+ +
+
+ + + + +
typedef half cutlass::gemm::FragmentMultiplyAdd< half, half, true >::ScalarAlphaBeta
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ FragmentMultiplyAdd()

+ +
+
+ + + + + +
+ + + + + + + +
CUTLASS_DEVICE cutlass::gemm::FragmentMultiplyAdd< half, half, true >::FragmentMultiplyAdd ()
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ multiply()

+ +
+
+
+template<typename FragmentB_ , typename FragmentCd_ >
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::gemm::FragmentMultiplyAdd< half, half, true >::multiply (half a,
FragmentB_ const & b,
FragmentCd_ & d 
)
+
+inline
+
+ +
+
+ +

◆ multiply_add()

+ +
+
+
+template<typename FragmentB_ , typename FragmentCd_ >
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::gemm::FragmentMultiplyAdd< half, half, true >::multiply_add (half a,
FragmentB_ const & b,
FragmentCd_ const & c,
FragmentCd_ & d 
)
+
+inline
+
+ +
+
+
The documentation for this struct was generated from the following file: +
+ + + + diff --git a/docs/structcutlass_1_1gemm_1_1Gemm-members.html b/docs/structcutlass_1_1gemm_1_1Gemm-members.html index f0424e29ac..d22133c4b7 100644 --- a/docs/structcutlass_1_1gemm_1_1Gemm-members.html +++ b/docs/structcutlass_1_1gemm_1_1Gemm-members.html @@ -79,12 +79,16 @@

This is the complete list of members for cutlass::gemm::Gemm< GemmTraits_ >, including all inherited members.

- - - + + + + + + + @@ -98,7 +102,7 @@
Gemm(Params const &params_, SharedStorage &shared_storage_)cutlass::gemm::Gemm< GemmTraits_ >inline
Index typedefcutlass::gemm::Gemm< GemmTraits_ >
kThreadscutlass::gemm::Gemm< GemmTraits_ >static
consume_tile(typename Traits::GlobalLoadStream &global_to_shared_stream, typename Traits::SharedStream &shared_load_stream, typename MultiplyAdd::Accumulators &accumulators, Index outer_k)cutlass::gemm::Gemm< GemmTraits_ >inline
Gemm(Params const &params_, SharedStorage &shared_storage_)cutlass::gemm::Gemm< GemmTraits_ >inline
Index typedefcutlass::gemm::Gemm< GemmTraits_ >
kThreadscutlass::gemm::Gemm< GemmTraits_ >static
kWarpGemmStepscutlass::gemm::Gemm< GemmTraits_ >static
launch(Params const &params, cudaStream_t stream=cudaStreamDefault)cutlass::gemm::Gemm< GemmTraits_ >inlinestatic
launch(CUfunction kernel, Params const &params, CUstream stream=CU_STREAM_LEGACY)cutlass::gemm::Gemm< GemmTraits_ >inlinestatic
multiply_add()cutlass::gemm::Gemm< GemmTraits_ >inline
MultiplyAdd typedefcutlass::gemm::Gemm< GemmTraits_ >
Params typedefcutlass::gemm::Gemm< GemmTraits_ >
paramscutlass::gemm::Gemm< GemmTraits_ >
ScalarA typedefcutlass::gemm::Gemm< GemmTraits_ >
ScalarB typedefcutlass::gemm::Gemm< GemmTraits_ >
diff --git a/docs/structcutlass_1_1gemm_1_1Gemm.html b/docs/structcutlass_1_1gemm_1_1Gemm.html index c2f993efa9..fcb0fa46f0 100644 --- a/docs/structcutlass_1_1gemm_1_1Gemm.html +++ b/docs/structcutlass_1_1gemm_1_1Gemm.html @@ -73,7 +73,6 @@
-Classes | Public Types | Public Member Functions | Static Public Member Functions | @@ -87,12 +86,6 @@

#include <gemm.h>

- - - - -

-Classes

struct  Params
 The params. More...
 
@@ -122,28 +115,38 @@ + + + + + +

Public Types

typedef Gemm< GemmTraits_ > This_
typedef Traits::Index Index
 The index. More...
 
typedef Traits::MultiplyAdd MultiplyAdd
 Define the mainloop iteration size. More...
 
typedef Traits::Params Params
 Use the params object defined in traits. More...
 
- + + + + +

Public Member Functions

CUTLASS_DEVICE Gemm (Params const &params_, SharedStorage &shared_storage_)
CUTLASS_DEVICE Gemm (Params const &params_, SharedStorage &shared_storage_)
 Ctor. More...
 
template<bool Residue, bool LastIteration>
CUTLASS_DEVICE void consume_tile (typename Traits::GlobalLoadStream &global_to_shared_stream, typename Traits::SharedStream &shared_load_stream, typename MultiplyAdd::Accumulators &accumulators, Index outer_k)
 Computes a warp-level GEMM on data held in shared memory. More...
 
CUTLASS_DEVICE void multiply_add ()
 Do the GEMM. More...
 
- - + + - - + +

Static Public Member Functions

static __host__ cudaError_t launch (Params const &params, cudaStream_t stream=cudaStreamDefault)
 Launch the kernel. More...
static __host__ cudaError_t launch (Params const &params, cudaStream_t stream=cudaStreamDefault)
 Support for NVRTC. More...
 
static __host__ cudaError_t launch (CUfunction kernel, Params const &params, CUstream stream=CU_STREAM_LEGACY)
 Launch the kernel. More...
static __host__ cudaError_t launch (CUfunction kernel, Params const &params, CUstream stream=CU_STREAM_LEGACY)
 Launch the kernel. More...
 
- + @@ -155,6 +158,8 @@ + +

Public Attributes

Params const & params
Params const & params
 The params. More...
 
SharedStorageshared_storage
static int const kThreads = Traits::GemmConfig::kThreads
 The number of threads. More...
 
static Index const kWarpGemmSteps
 

Member Typedef Documentation

@@ -171,6 +176,38 @@

+

+
+ +

◆ MultiplyAdd

+ +
+
+
+template<typename GemmTraits_ >
+ + + + +
typedef Traits::MultiplyAdd cutlass::gemm::Gemm< GemmTraits_ >::MultiplyAdd
+
+ +
+
+ +

◆ Params

+ +
+
+
+template<typename GemmTraits_ >
+ + + + +
typedef Traits::Params cutlass::gemm::Gemm< GemmTraits_ >::Params
+
+
@@ -316,7 +353,7 @@

CUTLASS_DEVICE cutlass::gemm::Gemm< GemmTraits_ >::Gemm

(Params const & Params const &  params_,
+ + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUTLASS_DEVICE void cutlass::gemm::Gemm< GemmTraits_ >::consume_tile (typename Traits::GlobalLoadStream & global_to_shared_stream,
typename Traits::SharedStream & shared_load_stream,
typename MultiplyAdd::Accumulators & accumulators,
Index outer_k 
)
+
+inline
+
+ +
+

◆ launch() [1/2]

@@ -355,7 +444,7 @@

static __host__ cudaError_t cutlass::gemm::Gemm< GemmTraits_ >::launch

(Params const & Params const &  params,
Params const & Params const &  params,
+ + + + +
+ + + + +
Index const cutlass::gemm::Gemm< GemmTraits_ >::kWarpGemmSteps
+
+static
+
+Initial value:
=
Traits::GemmConfig::AccumulatorsPerWarp::kD / MultiplyAdd::InstructionShape::kD
@@ -485,7 +598,7 @@

Params const& cutlass::gemm::Gemm< GemmTraits_ >::paramsParams const& cutlass::gemm::Gemm< GemmTraits_ >::params
@@ -514,7 +627,7 @@

diff --git a/docs/structcutlass_1_1gemm_1_1GemmConfig-members.html b/docs/structcutlass_1_1gemm_1_1GemmConfig-members.html index 18c258d733..f6bd03c8dc 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmConfig-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmConfig-members.html @@ -73,41 +73,44 @@

-
cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ > Member List
+
cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ > Member List
-

This is the complete list of members for cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >, including all inherited members.

+

This is the complete list of members for cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >, including all inherited members.

- - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Accumulators typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
AccumulatorsPerWarp typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
InstructionShape typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
kAccumulatorsPerLdsAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kAccumulatorsPerLdsBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerLdgAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerLdgBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerLdgCcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerLdsAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerLdsBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerLdsDcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerStgDcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerStsAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerStsBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kScalarsPerStsDcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kStagescutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kThreadscutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
kWarpSizecutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >static
MultiplyAdd typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
OutputTile typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
ScalarA typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
ScalarB typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
ScalarC typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
ScalarD typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
Warps typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >
Accumulators typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
AccumulatorsPerWarp typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
InstructionShape typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
kAccumulatorsPerLdsAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kAccumulatorsPerLdsBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kLaunchBoundscutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kResidueInPrologcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kResidueSeparatecutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerLdgAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerLdgBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerLdgCcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerLdsAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerLdsBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerLdsDcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerStgDcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerStsAcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerStsBcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kScalarsPerStsDcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kStagescutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kThreadscutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
kWarpSizecutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >static
MultiplyAdd typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
OutputTile typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
ScalarA typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
ScalarB typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
ScalarC typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
ScalarD typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
Warps typedefcutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >
diff --git a/docs/structcutlass_1_1gemm_1_1GemmConfig.html b/docs/structcutlass_1_1gemm_1_1GemmConfig.html index 3bc9b65f34..4471551bcb 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmConfig.html +++ b/docs/structcutlass_1_1gemm_1_1GemmConfig.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ > Struct Template Reference +Cutlass: cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ > Struct Template Reference @@ -77,241 +77,250 @@ Static Public Attributes | List of all members
-
cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ > Struct Template Reference
+
cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ > Struct Template Reference
-

#include <gemm_traits.h>

+

#include <gemm_config.h>

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Public Types

typedef ScalarA_ ScalarA
 The scalar for A. More...
 
typedef ScalarB_ ScalarB
 The scalar for B. More...
 
typedef ScalarC_ ScalarC
 The scalar for C. More...
 
typedef ScalarD_ ScalarD
 The scalar for D. More...
 
typedef OutputTile_ OutputTile
 The tile. More...
 
typedef MultiplyAdd_ MultiplyAdd
 The functor to do D = A*B + C. More...
 
typedef MultiplyAdd::InstructionShape InstructionShape
 The shape of the instruction. More...
 
typedef MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
 The number of accumulators per warp. More...
 
typedef MultiplyAdd::Accumulators Accumulators
 The accumulators. More...
 
typedef ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
 The number of warps. More...
 
typedef ScalarA_ ScalarA
 The scalar for A. More...
 
typedef ScalarB_ ScalarB
 The scalar for B. More...
 
typedef ScalarC_ ScalarC
 The scalar for C. More...
 
typedef ScalarD_ ScalarD
 The scalar for D. More...
 
typedef OutputTile_ OutputTile
 The tile. More...
 
typedef MultiplyAdd_ MultiplyAdd
 The functor to do D = A*B + C. More...
 
typedef MultiplyAdd::InstructionShape InstructionShape
 The shape of the instruction. More...
 
typedef MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
 The shape of warp-level GEMM. More...
 
typedef MultiplyAdd::Accumulators Accumulators
 The accumulators. More...
 
typedef ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
 The number of warps. More...
 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Static Public Attributes

static int const kWarpSize = cutlass::kWarpSize
 The default warp size (32 threads per warp). More...
 
static int const kThreads = ShapeCount<Warps>::kCount * kWarpSize
 The number of threads. More...
 
static int const kScalarsPerLdgA = kScalarsPerLdgA_
 The number of scalars per LDG/STS/LDS for A. More...
 
static int const kScalarsPerStsA = kScalarsPerStsA_
 
static int const kScalarsPerLdsA = kScalarsPerLdsA_
 
static int const kScalarsPerLdgB = kScalarsPerLdgB_
 The number of scalars per LDG/STS/LDS for B. More...
 
static int const kScalarsPerStsB = kScalarsPerStsB_
 
static int const kScalarsPerLdsB = kScalarsPerLdsB_
 
static int const kScalarsPerLdgC = kScalarsPerLdgCAndStgD_
 The number of scalars per LDG for C. More...
 
static int const kScalarsPerStgD = kScalarsPerLdgCAndStgD_
 The number of scalars per STS/LDS/STG for D. More...
 
static int const kScalarsPerStsD = kScalarsPerStsD_
 
static int const kScalarsPerLdsD = kScalarsPerLdsD_
 
static int const kAccumulatorsPerLdsA = kScalarsPerLdsA / InstructionShape::kD
 The number of accumulators that are going to be fed from one LDS A/B. More...
 
static int const kAccumulatorsPerLdsB = kScalarsPerLdsB / InstructionShape::kD
 
static int const kStages = kStages_
 The number of stages in shared memory to implement double, triple, more-buffering. More...
 
static int const kWarpSize = cutlass::kWarpSize
 The default warp size (32 threads per warp). More...
 
static int const kThreads = ShapeCount<Warps>::kCount * kWarpSize
 The number of threads. More...
 
static int const kScalarsPerLdgA = kScalarsPerLdgA_
 The number of scalars per LDG/STS/LDS for A. More...
 
static int const kScalarsPerStsA = kScalarsPerStsA_
 
static int const kScalarsPerLdsA = kScalarsPerLdsA_
 
static int const kScalarsPerLdgB = kScalarsPerLdgB_
 The number of scalars per LDG/STS/LDS for B. More...
 
static int const kScalarsPerStsB = kScalarsPerStsB_
 
static int const kScalarsPerLdsB = kScalarsPerLdsB_
 
static int const kScalarsPerLdgC = kScalarsPerLdgCAndStgD_
 The number of scalars per LDG for C. More...
 
static int const kScalarsPerStgD = kScalarsPerLdgCAndStgD_
 The number of scalars per STS/LDS/STG for D. More...
 
static int const kScalarsPerStsD = kScalarsPerStsD_
 
static int const kScalarsPerLdsD = kScalarsPerLdsD_
 
static int const kAccumulatorsPerLdsA = kScalarsPerLdsA / InstructionShape::kD
 The number of accumulators that are going to be fed from one LDS A/B. More...
 
static int const kAccumulatorsPerLdsB = kScalarsPerLdsB / InstructionShape::kD
 
static int const kStages = kStages_
 The number of stages in shared memory to implement double, triple, more-buffering. More...
 
static bool const kResidueSeparate = kResidueSeparate_
 If true, mainloop is instantiated twice. The first instantiation contains no predicate. More...
 
static bool const kResidueInProlog = kResidueInProlog_
 If true, residue is computed in the prologue. More...
 
static bool const kLaunchBounds = kLaunchBounds_
 If true, kernel is launched with launch bounds specified. More...
 

Member Typedef Documentation

- -

◆ Accumulators

+ +

◆ Accumulators

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef MultiplyAdd::Accumulators cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::Accumulatorstypedef MultiplyAdd::Accumulators cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::Accumulators
- -

◆ AccumulatorsPerWarp

+ +

◆ AccumulatorsPerWarp

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef MultiplyAdd::AccumulatorsPerWarp cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::AccumulatorsPerWarptypedef MultiplyAdd::AccumulatorsPerWarp cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::AccumulatorsPerWarp
- -

◆ InstructionShape

+ +

◆ InstructionShape

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef MultiplyAdd::InstructionShape cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::InstructionShapetypedef MultiplyAdd::InstructionShape cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::InstructionShape
- -

◆ MultiplyAdd

+ +

◆ MultiplyAdd

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef MultiplyAdd_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::MultiplyAddtypedef MultiplyAdd_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::MultiplyAdd
- -

◆ OutputTile

+ +

◆ OutputTile

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef OutputTile_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::OutputTiletypedef OutputTile_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::OutputTile
- -

◆ ScalarA

+ +

◆ ScalarA

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef ScalarA_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::ScalarAtypedef ScalarA_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::ScalarA
- -

◆ ScalarB

+ +

◆ ScalarB

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef ScalarB_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::ScalarBtypedef ScalarB_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::ScalarB
- -

◆ ScalarC

+ +

◆ ScalarC

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef ScalarC_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::ScalarCtypedef ScalarC_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::ScalarC
- -

◆ ScalarD

+ +

◆ ScalarD

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef ScalarD_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::ScalarDtypedef ScalarD_ cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::ScalarD
- -

◆ Warps

+ +

◆ Warps

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
- +
typedef ShapeDiv<OutputTile, AccumulatorsPerWarp>::Shape cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::Warpstypedef ShapeDiv<OutputTile, AccumulatorsPerWarp>::Shape cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::Warps
@@ -319,19 +328,19 @@

Member Data Documentation

- -

◆ kAccumulatorsPerLdsA

+ +

◆ kAccumulatorsPerLdsA

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
@@ -343,19 +352,19 @@

-

◆ kAccumulatorsPerLdsB

+ +

◆ kAccumulatorsPerLdsB

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kAccumulatorsPerLdsA = kScalarsPerLdsA / InstructionShape::kDint const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kAccumulatorsPerLdsA = kScalarsPerLdsA / InstructionShape::kD
@@ -367,19 +376,19 @@

-

◆ kScalarsPerLdgA

+ +

◆ kLaunchBounds

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kAccumulatorsPerLdsB = kScalarsPerLdsB / InstructionShape::kDint const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kAccumulatorsPerLdsB = kScalarsPerLdsB / InstructionShape::kD
@@ -391,19 +400,19 @@

-

◆ kScalarsPerLdgB

+ +

◆ kResidueInProlog

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerLdgA = kScalarsPerLdgA_bool const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kLaunchBounds = kLaunchBounds_
@@ -415,19 +424,19 @@

-

◆ kScalarsPerLdgC

+ +

◆ kResidueSeparate

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerLdgB = kScalarsPerLdgB_bool const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kResidueInProlog = kResidueInProlog_
@@ -439,19 +448,19 @@

-

◆ kScalarsPerLdsA

+ +

◆ kScalarsPerLdgA

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerLdgC = kScalarsPerLdgCAndStgD_bool const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kResidueSeparate = kResidueSeparate_
@@ -463,19 +472,19 @@

-

◆ kScalarsPerLdsB

+ +

◆ kScalarsPerLdgB

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerLdsA = kScalarsPerLdsA_int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerLdgA = kScalarsPerLdgA_
@@ -487,19 +496,19 @@

-

◆ kScalarsPerLdsD

+ +

◆ kScalarsPerLdgC

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerLdsB = kScalarsPerLdsB_int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerLdgB = kScalarsPerLdgB_
@@ -511,19 +520,19 @@

-

◆ kScalarsPerStgD

+ +

◆ kScalarsPerLdsA

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerLdsD = kScalarsPerLdsD_int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerLdgC = kScalarsPerLdgCAndStgD_
@@ -535,19 +544,19 @@

-

◆ kScalarsPerStsA

+ +

◆ kScalarsPerLdsB

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerStgD = kScalarsPerLdgCAndStgD_int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerLdsA = kScalarsPerLdsA_
@@ -559,19 +568,19 @@

-

◆ kScalarsPerStsB

+ +

◆ kScalarsPerLdsD

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerStsA = kScalarsPerStsA_int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerLdsB = kScalarsPerLdsB_
@@ -583,19 +592,19 @@

-

◆ kScalarsPerStsD

+ +

◆ kScalarsPerStgD

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerStsB = kScalarsPerStsB_int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerLdsD = kScalarsPerLdsD_
@@ -607,19 +616,19 @@

-

◆ kStages

+ +

◆ kScalarsPerStsA

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kScalarsPerStsD = kScalarsPerStsD_int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerStgD = kScalarsPerLdgCAndStgD_
@@ -631,19 +640,19 @@

-

◆ kThreads

+ +

◆ kScalarsPerStsB

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kStages = kStages_int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerStsA = kScalarsPerStsA_
@@ -655,19 +664,91 @@

-

◆ kWarpSize

+ +

◆ kScalarsPerStsD

-template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_>
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>

- +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kThreads = ShapeCount<Warps>::kCount * kWarpSizeint const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerStsB = kScalarsPerStsB_
+ + +
- + + +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_ >::kWarpSize = cutlass::kWarpSizeint const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kScalarsPerStsD = kScalarsPerStsD_
+
+static
+
+ +
+
+ +

◆ kStages

+ +
+
+
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
+ + + + + +
+ + + + +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kStages = kStages_
+
+static
+
+ +
+
+ +

◆ kThreads

+ +
+
+
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
+ + + + + +
+ + + + +
int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kThreads = ShapeCount<Warps>::kCount * kWarpSize
+
+static
+
+ +
+
+ +

◆ kWarpSize

+ +
+
+
+template<typename ScalarA_, typename ScalarB_, typename ScalarC_, typename ScalarD_, typename OutputTile_, typename MultiplyAdd_, int kScalarsPerLdgA_, int kScalarsPerStsA_, int kScalarsPerLdsA_, int kScalarsPerLdgB_, int kScalarsPerStsB_, int kScalarsPerLdsB_, int kScalarsPerLdgCAndStgD_, int kScalarsPerStsD_, int kScalarsPerLdsD_, int kStages_, bool kResidueSeparate_ = false, bool kResidueInProlog_ = false, bool kLaunchBounds_ = true>
+ + + @@ -680,12 +761,12 @@

gemm_traits.h +
  • gemm_config.h
  • diff --git a/docs/structcutlass_1_1gemm_1_1GemmCoord-members.html b/docs/structcutlass_1_1gemm_1_1GemmCoord-members.html new file mode 100644 index 0000000000..b3bb6a70bf --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1GemmCoord-members.html @@ -0,0 +1,152 @@ + + + + + + + +Cutlass: Member List + + + + + + + + + + +
    +
    +

    + + +
    int const cutlass::gemm::GemmConfig< ScalarA_, ScalarB_, ScalarC_, ScalarD_, OutputTile_, MultiplyAdd_, kScalarsPerLdgA_, kScalarsPerStsA_, kScalarsPerLdsA_, kScalarsPerLdgB_, kScalarsPerStsB_, kScalarsPerLdsB_, kScalarsPerLdgCAndStgD_, kScalarsPerStsD_, kScalarsPerLdsD_, kStages_, kResidueSeparate_, kResidueInProlog_, kLaunchBounds_ >::kWarpSize = cutlass::kWarpSize
    + + + + + +
    +
    Cutlass +
    +
    CUDA Templates for Linear Algebra Subroutines and Solvers
    +
    +
    + + + + + + + + +
    +
    + + +
    + +
    + + +
    +
    +
    +
    cutlass::gemm::GemmCoord Member List
    +
    +
    + +

    This is the complete list of members for cutlass::gemm::GemmCoord, including all inherited members.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    at()cutlass::Coord< 4, int >inline
    at(int dim)cutlass::Coord< 4, int >inline
    at() constcutlass::Coord< 4, int >inline
    at(int dim) constcutlass::Coord< 4, int >inline
    Base typedefcutlass::gemm::GemmCoord
    batch() constcutlass::gemm::GemmCoordinline
    batch()cutlass::gemm::GemmCoordinline
    clamp(Coord< kRank > const &max, Coord< kRank > const &min=Coord< kRank >())cutlass::Coord< 4, int >inline
    Coord(Index value=0)cutlass::Coord< 4, int >inline
    Coord(Index _idx[])cutlass::Coord< 4, int >inline
    Coord(Coord< kRank > const &coord)cutlass::Coord< 4, int >inline
    count() constcutlass::Coord< 4, int >inline
    dot(Coord const &b, T sum) constcutlass::Coord< 4, int >inline
    dot(Coord const &b) constcutlass::Coord< 4, int >inline
    GemmCoord()cutlass::gemm::GemmCoordinline
    GemmCoord(Coord< 3, Index > const &coord, Index _batch=0)cutlass::gemm::GemmCoordinline
    GemmCoord(Coord< 4, Index > const &coord)cutlass::gemm::GemmCoordinline
    GemmCoord(Index coord[4])cutlass::gemm::GemmCoordinline
    GemmCoord(Index k, Index n, Index m, Index batch=0)cutlass::gemm::GemmCoordinline
    idxcutlass::Coord< 4, int >
    Index typedefcutlass::gemm::GemmCoord
    k() constcutlass::gemm::GemmCoordinline
    k()cutlass::gemm::GemmCoordinline
    kBatchcutlass::gemm::GemmCoordstatic
    kKcutlass::gemm::GemmCoordstatic
    kMcutlass::gemm::GemmCoordstatic
    km() constcutlass::gemm::GemmCoordinline
    kNcutlass::gemm::GemmCoordstatic
    kn() constcutlass::gemm::GemmCoordinline
    knm() constcutlass::gemm::GemmCoordinline
    kRankcutlass::Coord< 4, int >static
    m() constcutlass::gemm::GemmCoordinline
    m()cutlass::gemm::GemmCoordinline
    Ncutlass::Coord< 4, int >static
    n() constcutlass::gemm::GemmCoordinline
    n()cutlass::gemm::GemmCoordinline
    nm() constcutlass::gemm::GemmCoordinline
    operator bool() constcutlass::Coord< 4, int >inline
    operator!() constcutlass::Coord< 4, int >inline
    operator!=(Coord< kRank > const &b) constcutlass::Coord< 4, int >inline
    operator*(Base const &b) constcutlass::gemm::GemmCoordinline
    Coord< 4, int >::operator*(Coord const &b) constcutlass::Coord< 4, int >inline
    operator*=(Base const &b)cutlass::gemm::GemmCoordinline
    Coord< 4, int >::operator*=(Coord const &b)cutlass::Coord< 4, int >inline
    operator+(Base const &b) constcutlass::gemm::GemmCoordinline
    Coord< 4, int >::operator+(Coord const &b) constcutlass::Coord< 4, int >inline
    operator+=(Base const &b)cutlass::gemm::GemmCoordinline
    Coord< 4, int >::operator+=(Coord const &b)cutlass::Coord< 4, int >inline
    operator-(Base const &b) constcutlass::gemm::GemmCoordinline
    Coord< 4, int >::operator-(Coord const &b) constcutlass::Coord< 4, int >inline
    operator-=(Base const &b)cutlass::gemm::GemmCoordinline
    Coord< 4, int >::operator-=(Coord const &b)cutlass::Coord< 4, int >inline
    operator/(Base const &b) constcutlass::gemm::GemmCoordinline
    Coord< 4, int >::operator/(Coord const &b) constcutlass::Coord< 4, int >inline
    operator/=(Base const &b)cutlass::gemm::GemmCoordinline
    Coord< 4, int >::operator/=(Coord const &b)cutlass::Coord< 4, int >inline
    operator<(Coord< kRank > const &b) constcutlass::Coord< 4, int >inline
    operator<=(Coord< kRank > const &b) constcutlass::Coord< 4, int >inline
    operator==(Coord< kRank > const &b) constcutlass::Coord< 4, int >inline
    operator[](int dim)cutlass::Coord< 4, int >inline
    operator[](int dim) constcutlass::Coord< 4, int >inline
    slice(int start=0, Index identity=0) constcutlass::Coord< 4, int >inline
    + + + + diff --git a/docs/structcutlass_1_1gemm_1_1GemmCoord.html b/docs/structcutlass_1_1gemm_1_1GemmCoord.html new file mode 100644 index 0000000000..9326fe152a --- /dev/null +++ b/docs/structcutlass_1_1gemm_1_1GemmCoord.html @@ -0,0 +1,1102 @@ + + + + + + + +Cutlass: cutlass::gemm::GemmCoord Struct Reference + + + + + + + + + + +
    +
    + + + + + + +
    +
    Cutlass +
    +
    CUDA Templates for Linear Algebra Subroutines and Solvers
    +
    +
    + + + + + + + + +
    +
    + + +
    + +
    + + +
    +
    + +
    +
    cutlass::gemm::GemmCoord Struct Reference
    +
    +
    + +

    #include <gemm_coord.h>

    +
    +Inheritance diagram for cutlass::gemm::GemmCoord:
    +
    +
    + + +cutlass::Coord< 4, int > + +
    + + + + + + + + + + + + +

    +Public Types

    typedef int Index
     Integer-valued index. More...
     
    typedef Coord< 4, IndexBase
     Base type is a Coord of rank=4. More...
     
    - Public Types inherited from cutlass::Coord< 4, int >
    typedef int Index
     Index type used to store elements. More...
     
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    +Public Member Functions

    CUTLASS_HOST_DEVICE GemmCoord ()
     Default ctor. More...
     
    CUTLASS_HOST_DEVICE GemmCoord (Coord< 3, Index > const &coord, Index _batch=0)
     Constructs from Coord<3> and a batch. More...
     
    CUTLASS_HOST_DEVICE GemmCoord (Coord< 4, Index > const &coord)
     Constructs from Coord<4> More...
     
    CUTLASS_HOST_DEVICE GemmCoord (Index coord[4])
     Constructs from an array of coordinate elements. More...
     
    CUTLASS_HOST_DEVICE GemmCoord (Index k, Index n, Index m, Index batch=0)
     Helper to construct from a K, N, M, batch variables. More...
     
    CUTLASS_HOST_DEVICE Index const & m () const
     Returns the GEMM M coordinate. More...
     
    CUTLASS_HOST_DEVICE Indexm ()
     Returns reference to the GEMM M coordinate. More...
     
    CUTLASS_HOST_DEVICE Index const & n () const
     Returns the GEMM N coordinate. More...
     
    CUTLASS_HOST_DEVICE Indexn ()
     Returns reference to the GEMM N coordinate. More...
     
    CUTLASS_HOST_DEVICE Index const & k () const
     Returns the GEMM K coordinate. More...
     
    CUTLASS_HOST_DEVICE Indexk ()
     Returns reference to the GEMM K coordinate. More...
     
    CUTLASS_HOST_DEVICE Index const & batch () const
     Returns the GEMM batch coordinate. More...
     
    CUTLASS_HOST_DEVICE Indexbatch ()
     Returns reference to the GEMM batch coordinate. More...
     
    CUTLASS_HOST_DEVICE Coord< 3 > knm () const
     Obtains a Coord<3> from GemmCoord. More...
     
    CUTLASS_HOST_DEVICE Coord< 2 > nm () const
     Obtains a Coord<2> from GemmCoord. More...
     
    CUTLASS_HOST_DEVICE Coord< 2 > km () const
     Obtains a Coord<2> from GemmCoord. More...
     
    CUTLASS_HOST_DEVICE Coord< 2 > kn () const
     Obtains a Coord<2> from GemmCoord. More...
     
    CUTLASS_HOST_DEVICE GemmCoord operator+ (Base const &b) const
     Element-wise addition. More...
     
    CUTLASS_HOST_DEVICE GemmCoord operator- (Base const &b) const
     Element-wise subtraction. More...
     
    CUTLASS_HOST_DEVICE GemmCoord operator* (Base const &b) const
     Element-wise multiplication. More...
     
    CUTLASS_HOST_DEVICE GemmCoord operator/ (Base const &b) const
     Element-wise division. More...
     
    CUTLASS_HOST_DEVICE GemmCoordoperator+= (Base const &b)
     In-place addition. More...
     
    CUTLASS_HOST_DEVICE GemmCoordoperator-= (Base const &b)
     In-place subtraction. More...
     
    CUTLASS_HOST_DEVICE GemmCoordoperator*= (Base const &b)
     In-place multiplication. More...
     
    CUTLASS_HOST_DEVICE GemmCoordoperator/= (Base const &b)
     In-place division. More...
     
    - Public Member Functions inherited from cutlass::Coord< 4, int >
    CUTLASS_HOST_DEVICE Coord (Index value=0)
     Default ctor initializes uniformly. More...
     
    CUTLASS_HOST_DEVICE Coord (Index _idx[])
     Constructs from an array of integers. More...
     
    CUTLASS_HOST_DEVICE Coord (Coord< kRank > const &coord)
     Constructs from an array of integers. More...
     
    CUTLASS_HOST_DEVICE Coord< Slice > slice (int start=0, Index identity=0) const
     
    CUTLASS_HOST_DEVICE operator bool () const
     Returns true if Coord is non-zero. More...
     
    CUTLASS_HOST_DEVICE bool operator! () const
     Returns true if Coord is uniformly zero. More...
     
    CUTLASS_HOST_DEVICE Coord operator+ (Coord const &b) const
     Element-wise addition. More...
     
    CUTLASS_HOST_DEVICE Coord operator- (Coord const &b) const
     Element-wise subtraction. More...
     
    CUTLASS_HOST_DEVICE Coord operator* (Coord const &b) const
     Element-wise multiplication. More...
     
    CUTLASS_HOST_DEVICE Coord operator/ (Coord const &b) const
     Element-wise division. More...
     
    CUTLASS_HOST_DEVICE Coordoperator+= (Coord const &b)
     In-place addition. More...
     
    CUTLASS_HOST_DEVICE Coordoperator-= (Coord const &b)
     In-place subtraction. More...
     
    CUTLASS_HOST_DEVICE Coordoperator*= (Coord const &b)
     In-place multiplication. More...
     
    CUTLASS_HOST_DEVICE Coordoperator/= (Coord const &b)
     In-place division. More...
     
    CUTLASS_HOST_DEVICE Indexoperator[] (int dim)
     Member access operator. More...
     
    CUTLASS_HOST_DEVICE Index const & operator[] (int dim) const
     Member access operator. More...
     
    CUTLASS_HOST_DEVICEdot (Coord const &b, T sum) const
     Computes the dot product of two Coord instances. More...
     
    CUTLASS_HOST_DEVICEdot (Coord const &b) const
     Computes the dot product of two Coord instances. More...
     
    CUTLASS_HOST_DEVICE Indexat ()
     Gets the index of a given Coord element. More...
     
    CUTLASS_HOST_DEVICE Indexat (int dim)
     Access via index; may limit unrolling potential. More...
     
    CUTLASS_HOST_DEVICE Index const & at () const
     Gets the index of a given Coord element. More...
     
    CUTLASS_HOST_DEVICE Index const & at (int dim) const
     Access via index; may limit unrolling potential. More...
     
    CUTLASS_HOST_DEVICE bool operator== (Coord< kRank > const &b) const
     Determines if two Coord<> objects are equal. More...
     
    CUTLASS_HOST_DEVICE bool operator!= (Coord< kRank > const &b) const
     Not equal. More...
     
    CUTLASS_HOST_DEVICE Coordclamp (Coord< kRank > const &max, Coord< kRank > const &min=Coord< kRank >())
     Clamps a coordinate to a range specified by maximum and minimum values. More...
     
    CUTLASS_HOST_DEVICE Index count () const
     Returns the product of all elements. More...
     
    CUTLASS_HOST_DEVICE bool operator< (Coord< kRank > const &b) const
     Less than operator. More...
     
    CUTLASS_HOST_DEVICE bool operator<= (Coord< kRank > const &b) const
     Less than or equals operator. More...
     
    + + + + + + + + + + + + + + + + + + + + +

    +Static Public Attributes

    static int const kK = 0
     GEMM K dimension - inner dimension of the GEMM problem. More...
     
    static int const kN = 1
     GEMM N dimension - columns of the output C matrix. More...
     
    static int const kM = 2
     GEMM M dimension - rows of the output C matrix. More...
     
    static int const kBatch = 3
     Batch dimension - for generalizing to larger problems. More...
     
    - Static Public Attributes inherited from cutlass::Coord< 4, int >
    static int const kRank
     Number of elements in Coord. More...
     
    static int const N
     Number of elements in Coord, aliased for compatibility. More...
     
    + + + + + +

    +Additional Inherited Members

    - Public Attributes inherited from cutlass::Coord< 4, int >
    Index idx [kRank]
     Indices. More...
     
    +

    Detailed Description

    +

    GemmCoord is a structure derived from Coord<4> that specifies a location within the coordinate space of a GEMM problem.

    +

    Member Typedef Documentation

    + +

    ◆ Base

    + +
    +
    + + + + +
    typedef Coord<4, Index> cutlass::gemm::GemmCoord::Base
    +
    + +
    +
    + +

    ◆ Index

    + +
    +
    + + + + +
    typedef int cutlass::gemm::GemmCoord::Index
    +
    + +
    +
    +

    Constructor & Destructor Documentation

    + +

    ◆ GemmCoord() [1/5]

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE cutlass::gemm::GemmCoord::GemmCoord ()
    +
    +inline
    +
    + +
    +
    + +

    ◆ GemmCoord() [2/5]

    + +
    +
    + + + + + +
    + + + + + + + + + + + + + + + + + + +
    CUTLASS_HOST_DEVICE cutlass::gemm::GemmCoord::GemmCoord (Coord< 3, Index > const & coord,
    Index _batch = 0 
    )
    +
    +inline
    +
    + +
    +
    + +

    ◆ GemmCoord() [3/5]

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE cutlass::gemm::GemmCoord::GemmCoord (Coord< 4, Index > const & coord)
    +
    +inline
    +
    + +
    +
    + +

    ◆ GemmCoord() [4/5]

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE cutlass::gemm::GemmCoord::GemmCoord (Index coord[4])
    +
    +inline
    +
    + +
    +
    + +

    ◆ GemmCoord() [5/5]

    + +
    +
    + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    CUTLASS_HOST_DEVICE cutlass::gemm::GemmCoord::GemmCoord (Index k,
    Index n,
    Index m,
    Index batch = 0 
    )
    +
    +inline
    +
    + +
    +
    +

    Member Function Documentation

    + +

    ◆ batch() [1/2]

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Index const& cutlass::gemm::GemmCoord::batch () const
    +
    +inline
    +
    + +
    +
    + +

    ◆ batch() [2/2]

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Index& cutlass::gemm::GemmCoord::batch ()
    +
    +inline
    +
    + +
    +
    + +

    ◆ k() [1/2]

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Index const& cutlass::gemm::GemmCoord::k () const
    +
    +inline
    +
    + +
    +
    + +

    ◆ k() [2/2]

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Index& cutlass::gemm::GemmCoord::k ()
    +
    +inline
    +
    + +
    +
    + +

    ◆ km()

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Coord<2> cutlass::gemm::GemmCoord::km () const
    +
    +inline
    +
    + +
    +
    + +

    ◆ kn()

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Coord<2> cutlass::gemm::GemmCoord::kn () const
    +
    +inline
    +
    + +
    +
    + +

    ◆ knm()

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Coord<3> cutlass::gemm::GemmCoord::knm () const
    +
    +inline
    +
    + +
    +
    + +

    ◆ m() [1/2]

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Index const& cutlass::gemm::GemmCoord::m () const
    +
    +inline
    +
    + +
    +
    + +

    ◆ m() [2/2]

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Index& cutlass::gemm::GemmCoord::m ()
    +
    +inline
    +
    + +
    +
    + +

    ◆ n() [1/2]

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Index const& cutlass::gemm::GemmCoord::n () const
    +
    +inline
    +
    + +
    +
    + +

    ◆ n() [2/2]

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Index& cutlass::gemm::GemmCoord::n ()
    +
    +inline
    +
    + +
    +
    + +

    ◆ nm()

    + +
    +
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE Coord<2> cutlass::gemm::GemmCoord::nm () const
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator*()

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmCoord cutlass::gemm::GemmCoord::operator* (Base const & b) const
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator*=()

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmCoord& cutlass::gemm::GemmCoord::operator*= (Base const & b)
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator+()

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmCoord cutlass::gemm::GemmCoord::operator+ (Base const & b) const
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator+=()

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmCoord& cutlass::gemm::GemmCoord::operator+= (Base const & b)
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator-()

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmCoord cutlass::gemm::GemmCoord::operator- (Base const & b) const
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator-=()

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmCoord& cutlass::gemm::GemmCoord::operator-= (Base const & b)
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator/()

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmCoord cutlass::gemm::GemmCoord::operator/ (Base const & b) const
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator/=()

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmCoord& cutlass::gemm::GemmCoord::operator/= (Base const & b)
    +
    +inline
    +
    + +
    +
    +

    Member Data Documentation

    + +

    ◆ kBatch

    + +
    +
    + + + + + +
    + + + + +
    int const cutlass::gemm::GemmCoord::kBatch = 3
    +
    +static
    +
    + +
    +
    + +

    ◆ kK

    + +
    +
    + + + + + +
    + + + + +
    int const cutlass::gemm::GemmCoord::kK = 0
    +
    +static
    +
    + +
    +
    + +

    ◆ kM

    + +
    +
    + + + + + +
    + + + + +
    int const cutlass::gemm::GemmCoord::kM = 2
    +
    +static
    +
    + +
    +
    + +

    ◆ kN

    + +
    +
    + + + + + +
    + + + + +
    int const cutlass::gemm::GemmCoord::kN = 1
    +
    +static
    +
    + +
    +
    +
    The documentation for this struct was generated from the following file: +
    + + + + diff --git a/docs/structcutlass_1_1gemm_1_1GemmCoord.png b/docs/structcutlass_1_1gemm_1_1GemmCoord.png new file mode 100644 index 0000000000..225d8b64e1 Binary files /dev/null and b/docs/structcutlass_1_1gemm_1_1GemmCoord.png differ diff --git a/docs/structcutlass_1_1gemm_1_1GemmDesc-members.html b/docs/structcutlass_1_1gemm_1_1GemmDesc-members.html index 5c3b045aa6..28c238453a 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmDesc-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmDesc-members.html @@ -73,29 +73,41 @@
    -
    cutlass::gemm::GemmDesc< Scalar_, Index_ > Member List
    +
    cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ > Member List
    -

    This is the complete list of members for cutlass::gemm::GemmDesc< Scalar_, Index_ >, including all inherited members.

    +

    This is the complete list of members for cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >, including all inherited members.

    - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + +
    alphacutlass::gemm::GemmDesc< Scalar_, Index_ >
    betacutlass::gemm::GemmDesc< Scalar_, Index_ >
    d_acutlass::gemm::GemmDesc< Scalar_, Index_ >
    d_bcutlass::gemm::GemmDesc< Scalar_, Index_ >
    d_ccutlass::gemm::GemmDesc< Scalar_, Index_ >
    d_dcutlass::gemm::GemmDesc< Scalar_, Index_ >
    kcutlass::gemm::GemmDesc< Scalar_, Index_ >
    ldacutlass::gemm::GemmDesc< Scalar_, Index_ >
    ldbcutlass::gemm::GemmDesc< Scalar_, Index_ >
    ldccutlass::gemm::GemmDesc< Scalar_, Index_ >
    lddcutlass::gemm::GemmDesc< Scalar_, Index_ >
    mcutlass::gemm::GemmDesc< Scalar_, Index_ >
    ncutlass::gemm::GemmDesc< Scalar_, Index_ >
    Acutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    alphacutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    AType typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    Bcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    batch_stride_Acutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    batch_stride_Bcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    batch_stride_Ccutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    batch_stride_Dcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    betacutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    BType typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    Ccutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    CType typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    Dcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    DType typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    GemmDesc()cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >inline
    GemmDesc(Coord< 3 > _problem_size, SType _alpha, TensorRefA const &_A, TensorRefB const &_B, SType _beta, TensorRefC const &_C, TensorRefD const &_D)cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >inline
    GemmDesc(GemmCoord _problem_size, SType _alpha, TensorRefA const &_A, TensorRefB const &_B, SType _beta, TensorRefC const &_C, TensorRefD const &_D)cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >inline
    GemmDesc(GemmCoord _problem_size, SType _alpha, TensorRefA const &_A, long long _batch_stride_A, TensorRefB const &_B, long long _batch_stride_B, SType _beta, TensorRefC const &_C, long long _batch_stride_C, TensorRefD const &_D, long long _batch_stride_D)cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >inline
    Index typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    problem_sizecutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    SType typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    TensorRefA typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    TensorRefB typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    TensorRefC typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    TensorRefD typedefcutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmDesc.html b/docs/structcutlass_1_1gemm_1_1GemmDesc.html index 9f4c8fd0b2..e527a78df4 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmDesc.html +++ b/docs/structcutlass_1_1gemm_1_1GemmDesc.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::GemmDesc< Scalar_, Index_ > Struct Template Reference +Cutlass: cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ > Struct Template Reference @@ -73,257 +73,692 @@
    -
    cutlass::gemm::GemmDesc< Scalar_, Index_ > Struct Template Reference
    +
    cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ > Struct Template Reference
    -

    #include <gemm.h>

    +

    GEMM problem description. +

    + +

    #include <gemm_desc.h>

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    +Public Types

    typedef Index_ Index
     Index type for dimensions and strides. More...
     
    typedef AType_ AType
     Element type of the A operand matrix. More...
     
    typedef TensorRef< AType const, 2 > TensorRefA
     Tensor reference to A operand. More...
     
    typedef BType_ BType
     Element type of the B operand matrix. More...
     
    typedef TensorRef< BType const, 2 > TensorRefB
     Tensor reference to B operand. More...
     
    typedef CType_ CType
     Source accumulator matrix type. More...
     
    typedef TensorRef< CType const, 2 > TensorRefC
     Tensor reference to C operand. More...
     
    typedef DType_ DType
     Destination accumulator type. More...
     
    typedef TensorRef< DType, 2 > TensorRefD
     Tensor reference to D operand. More...
     
    typedef SType_ SType
     Scalar type for alpha and beta. More...
     
    + + + + + + + + + + + + + +

    +Public Member Functions

    CUTLASS_HOST_DEVICE GemmDesc ()
     Default ctor. More...
     
    CUTLASS_HOST_DEVICE GemmDesc (Coord< 3 > _problem_size, SType _alpha, TensorRefA const &_A, TensorRefB const &_B, SType _beta, TensorRefC const &_C, TensorRefD const &_D)
     Constructor for basic GEMM with batch count = 1. More...
     
    CUTLASS_HOST_DEVICE GemmDesc (GemmCoord _problem_size, SType _alpha, TensorRefA const &_A, TensorRefB const &_B, SType _beta, TensorRefC const &_C, TensorRefD const &_D)
     Constructor for basic GEMM with batch count = 1. More...
     
    CUTLASS_HOST_DEVICE GemmDesc (GemmCoord _problem_size, SType _alpha, TensorRefA const &_A, long long _batch_stride_A, TensorRefB const &_B, long long _batch_stride_B, SType _beta, TensorRefC const &_C, long long _batch_stride_C, TensorRefD const &_D, long long _batch_stride_D)
     Constructor for strided batch GEMM. More...
     
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Public Attributes

    Index_ m
     The dimensions of the GEMM. More...
     
    Index_ n
     
    Index_ k
     
    Scalar_ alpha
     The alpha/beta scaling values. More...
     
    Scalar_ beta
     
    void const * d_a
     The source matrix A. More...
     
    Index_ lda
     The stride for A. More...
     
    void const * d_b
     The source matrix B. More...
     
    Index_ ldb
     The stride for B. More...
     
    void const * d_c
     The source matrix C. More...
     
    Index_ ldc
     The stride for C. More...
     
    void * d_d
     The destination matrix D. More...
     
    Index_ ldd
     The stride for D. More...
     
    GemmCoord problem_size
     The dimensions of the GEMM. More...
     
    SType alpha
     The alpha scaling values. More...
     
    TensorRefA A
     The source matrix A. More...
     
    long long batch_stride_A
     batch stride for A operand More...
     
    TensorRefB B
     The source matrix B. More...
     
    long long batch_stride_B
     batch stride for B operand More...
     
    SType beta
     The beta scaling values. More...
     
    TensorRefC C
     The source matrix C. More...
     
    long long batch_stride_C
     batch stride for C operand More...
     
    TensorRefD D
     The destination matrix D. More...
     
    long long batch_stride_D
     batch stride for D operand More...
     
    -

    Member Data Documentation

    - -

    ◆ alpha

    +

    Member Typedef Documentation

    + +

    ◆ AType

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + +
    typedef AType_ cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::AType
    +
    + +
    +
    + +

    ◆ BType

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + +
    typedef BType_ cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::BType
    +
    + +
    +
    + +

    ◆ CType

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    Scalar_ cutlass::gemm::GemmDesc< Scalar_, Index_ >::alphatypedef CType_ cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::CType
    - -

    ◆ beta

    + +

    ◆ DType

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    Scalar_ cutlass::gemm::GemmDesc< Scalar_, Index_ >::betatypedef DType_ cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::DType
    - -

    ◆ d_a

    + +

    ◆ Index

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + +
    typedef Index_ cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::Index
    +
    + +
    +
    + +

    ◆ SType

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + +
    typedef SType_ cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::SType
    +
    + +
    +
    + +

    ◆ TensorRefA

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + +
    typedef TensorRef<AType const, 2> cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::TensorRefA
    +
    + +
    +
    + +

    ◆ TensorRefB

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + +
    typedef TensorRef<BType const, 2> cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::TensorRefB
    +
    + +
    +
    + +

    ◆ TensorRefC

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + +
    typedef TensorRef<CType const, 2> cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::TensorRefC
    +
    + +
    +
    + +

    ◆ TensorRefD

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + +
    typedef TensorRef<DType, 2> cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::TensorRefD
    +
    + +
    +
    +

    Constructor & Destructor Documentation

    + +

    ◆ GemmDesc() [1/4]

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + + +
    + + + + + + + +
    CUTLASS_HOST_DEVICE cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::GemmDesc ()
    +
    +inline
    +
    + +
    +
    + +

    ◆ GemmDesc() [2/4]

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    CUTLASS_HOST_DEVICE cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::GemmDesc (Coord< 3 > _problem_size,
    SType _alpha,
    TensorRefA const & _A,
    TensorRefB const & _B,
    SType _beta,
    TensorRefC const & _C,
    TensorRefD const & _D 
    )
    +
    +inline
    +
    + +
    +
    + +

    ◆ GemmDesc() [3/4]

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    CUTLASS_HOST_DEVICE cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::GemmDesc (GemmCoord _problem_size,
    SType _alpha,
    TensorRefA const & _A,
    TensorRefB const & _B,
    SType _beta,
    TensorRefC const & _C,
    TensorRefD const & _D 
    )
    +
    +inline
    +
    + +
    +
    + +

    ◆ GemmDesc() [4/4]

    + +
    +
    +
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    CUTLASS_HOST_DEVICE cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::GemmDesc (GemmCoord _problem_size,
    SType _alpha,
    TensorRefA const & _A,
    long long _batch_stride_A,
    TensorRefB const & _B,
    long long _batch_stride_B,
    SType _beta,
    TensorRefC const & _C,
    long long _batch_stride_C,
    TensorRefD const & _D,
    long long _batch_stride_D 
    )
    +
    +inline
    +
    + +
    +
    +

    Member Data Documentation

    + +

    ◆ A

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    void const* cutlass::gemm::GemmDesc< Scalar_, Index_ >::d_aTensorRefA cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::A
    - -

    ◆ d_b

    + +

    ◆ alpha

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    void const* cutlass::gemm::GemmDesc< Scalar_, Index_ >::d_bSType cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::alpha
    - -

    ◆ d_c

    + +

    ◆ B

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    void const* cutlass::gemm::GemmDesc< Scalar_, Index_ >::d_cTensorRefB cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::B
    - -

    ◆ d_d

    + +

    ◆ batch_stride_A

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    void* cutlass::gemm::GemmDesc< Scalar_, Index_ >::d_dlong long cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::batch_stride_A
    - -

    ◆ k

    + +

    ◆ batch_stride_B

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    Index_ cutlass::gemm::GemmDesc< Scalar_, Index_ >::klong long cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::batch_stride_B
    - -

    ◆ lda

    + +

    ◆ batch_stride_C

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    Index_ cutlass::gemm::GemmDesc< Scalar_, Index_ >::ldalong long cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::batch_stride_C
    - -

    ◆ ldb

    + +

    ◆ batch_stride_D

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    Index_ cutlass::gemm::GemmDesc< Scalar_, Index_ >::ldblong long cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::batch_stride_D
    - -

    ◆ ldc

    + +

    ◆ beta

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    Index_ cutlass::gemm::GemmDesc< Scalar_, Index_ >::ldcSType cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::beta
    - -

    ◆ ldd

    + +

    ◆ C

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    Index_ cutlass::gemm::GemmDesc< Scalar_, Index_ >::lddTensorRefC cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::C
    - -

    ◆ m

    + +

    ◆ D

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    Index_ cutlass::gemm::GemmDesc< Scalar_, Index_ >::mTensorRefD cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::D
    - -

    ◆ n

    + +

    ◆ problem_size

    -template<typename Scalar_, typename Index_ = int>
    +template<typename AType_, typename BType_, typename CType_, typename DType_, typename SType_, typename Index_ = int>
    - +
    Index_ cutlass::gemm::GemmDesc< Scalar_, Index_ >::nGemmCoord cutlass::gemm::GemmDesc< AType_, BType_, CType_, DType_, SType_, Index_ >::problem_size
    @@ -331,12 +766,12 @@

    gemm.h +
  • gemm_desc.h
  • diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogue-members.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogue-members.html index 6024711bae..88bf1a749a 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogue-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogue-members.html @@ -80,37 +80,36 @@

    This is the complete list of members for cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >, including all inherited members.

    - - + + - - - - - - - - - - - - + + + + + + + + + + + + - - - - - - + + + + +
    Accumulators typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    epilogue(Coord< 3 > const &block, Accumulators &accumulators)cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >inline
    epilogue_with_or_without_beta(Coord< 3 > const &block, Accumulators &accumulators)cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >inline
    epilogue(Accumulators &accumulators, Coord< 3 > const &block=make_Coord(0, 0, 0), int batch_id=0)cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >inline
    epilogue_with_or_without_beta(Accumulators &accumulators, Coord< 3 > const &block, int batch_id)cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >inline
    Functor typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    GemmEpilogue(Params const &params_, SharedStorage &shared_storage_, Index m_, Index n_)cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >inline
    GlobalLoadIteratorC typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    GlobalStoreIteratorD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    GlobalTransformerC typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    GlobalTransformerD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    Index typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    Iterations typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    mcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    ncutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    OutputTile typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    paramscutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    Params typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    functorcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    GemmEpilogue(Params const &params_, SharedStorage &shared_storage_, Coord< 3 > const &_problem_size)cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >inline
    GlobalLoadIteratorC typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    GlobalStoreIteratorD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    GlobalTransformerC typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    GlobalTransformerD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    Index typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    Iterations typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    OutputTile typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    paramscutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    Params typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    problem_sizecutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    Scalar typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    ScalarC typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    ScalarD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    shared_load_fence()cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >inline
    shared_storagecutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    shared_store_fence()cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >inline
    SharedLoadIteratorD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    SharedLoadTransformerD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    SharedStorage typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    SharedStoreIteratorD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    SharedStoreTransformerD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    Traits typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    SharedLoadStreamD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    SharedStorage typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    SharedStoreIteratorD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    SharedStoreTransformerD typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    Traits typedefcutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogue.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogue.html index 3f08c9cf73..ec6b33ee41 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogue.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogue.html @@ -138,12 +138,9 @@
    typedef Traits::SharedStoreTransformerD SharedStoreTransformerD
     The shared store transformer for D. More...
     
    typedef Traits::SharedLoadIteratorD SharedLoadIteratorD
     The iterator to load D in shared memory. More...
     
    typedef Copy< typename SharedLoadIteratorD::Fragment > SharedLoadTransformerD
     The shared load transformer for D. More...
     
    typedef Traits::SharedLoadStreamD SharedLoadStreamD
     The iterator to load D in shared memory. More...
     
    typedef Traits::Index Index
     The index. More...
     
    - - - - - - - - - + + + + + + + + + @@ -180,11 +177,11 @@ - - - - - + + + + +

    Public Member Functions

    CUTLASS_DEVICE GemmEpilogue (Params const &params_, SharedStorage &shared_storage_, Index m_, Index n_)
     Ctor. More...
     
    CUTLASS_DEVICE void epilogue (Coord< 3 > const &block, Accumulators &accumulators)
     Execute the epilogue. More...
     
    template<bool kBetaIsZero_>
    CUTLASS_DEVICE void epilogue_with_or_without_beta (Coord< 3 > const &block, Accumulators &accumulators)
     
    CUTLASS_DEVICE GemmEpilogue (Params const &params_, SharedStorage &shared_storage_, Coord< 3 > const &_problem_size)
     Ctor. More...
     
    CUTLASS_DEVICE void epilogue (Accumulators &accumulators, Coord< 3 > const &block=make_Coord(0, 0, 0), int batch_id=0)
     Execute the epilogue. More...
     
    template<bool kSourceRequired>
    CUTLASS_DEVICE void epilogue_with_or_without_beta (Accumulators &accumulators, Coord< 3 > const &block, int batch_id)
     
    CUTLASS_DEVICE void shared_load_fence ()
     The memory fence for shared loads. More...
     
    SharedStorageshared_storage
     The shared storage. More...
     
    Index m
     The dimensions of the GEMM. More...
     
    Index n
     
    Coord< 3 > problem_size
     The dimensions of the GEMM. More...
     
    Functor functor
     

    Member Typedef Documentation

    @@ -396,8 +393,8 @@

    -

    ◆ SharedLoadIteratorD

    + +

    ◆ SharedLoadStreamD

    @@ -405,23 +402,7 @@

    typedef Traits::SharedLoadIteratorD cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::SharedLoadIteratorD
    -
    - -
    -
    - -

    ◆ SharedLoadTransformerD

    - -
    -
    -
    -template<typename GemmEpilogueTraits_ >
    - - - +
    typedef Copy<typename SharedLoadIteratorD::Fragment> cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::SharedLoadTransformerDtypedef Traits::SharedLoadStreamD cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::SharedLoadStreamD
    @@ -493,8 +474,8 @@

    Constructor & Destructor Documentation

    - -

    ◆ GemmEpilogue()

    + +

    ◆ GemmEpilogue()

    @@ -519,14 +500,8 @@

    Index m_,
    Index n_ Coord< 3 > const & _problem_size 
    (Accumulatorsaccumulators,
    Coord< 3 > const & block, block = make_Coord(0, 0, 0),
    Accumulatorsaccumulators int batch_id = 0 
    + + + + + + - - + + @@ -677,8 +664,8 @@

    Member Data Documentation

    - -

    ◆ m

    + +

    ◆ functor

    @@ -686,15 +673,15 @@

    - +
    @@ -598,14 +579,20 @@

    CUTLASS_DEVICE void cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::epilogue_with_or_without_beta

    (Accumulatorsaccumulators,
    Coord< 3 > const &  block,
    Accumulatorsaccumulators int batch_id 
    Index cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::mFunctor cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::functor
    -
    -

    ◆ n

    + +

    ◆ params

    @@ -702,15 +689,15 @@

    Index cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::nParams const& cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::params
    - -

    ◆ params

    + +

    ◆ problem_size

    @@ -718,7 +705,7 @@

    Params const& cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::paramsCoord<3> cutlass::gemm::GemmEpilogue< GemmEpilogueTraits_ >::problem_size
    @@ -747,7 +734,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits-members.html index 2035e3bf4f..894f46bb58 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits-members.html @@ -73,32 +73,32 @@

    -
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ > Member List
    +
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ > Member List
    -

    This is the complete list of members for cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >, including all inherited members.

    +

    This is the complete list of members for cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >, including all inherited members.

    - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + +
    Accumulators typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    Delta typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    Functor typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    GlobalLoadIteratorC typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    GlobalStoreIteratorD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    GlobalTransformerC typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    GlobalTransformerD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    Index typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    Iterations typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    OutputTile typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    Scalar typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    ScalarC typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    ScalarD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    SharedLoadIteratorD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    SharedStoreIteratorD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    SharedStoreTransformerD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >
    Accumulators typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    Delta typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    Functor typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    GlobalLoadIteratorC typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    GlobalStoreIteratorD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    GlobalTransformerC typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    GlobalTransformerD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    Index typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    Iterations typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    OutputTile typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    Scalar typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    ScalarC typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    ScalarD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    SharedLoadStreamD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    SharedStoreIteratorD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    SharedStoreTransformerD typedefcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits.html index 8d99223d79..c932485477 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ > Struct Template Reference +Cutlass: cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ > Struct Template Reference @@ -77,7 +77,7 @@ Public Types | List of all members
    -
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ > Struct Template Reference
    +
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ > Struct Template Reference
    @@ -97,65 +97,65 @@
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Public Types

    typedef OutputTile_ OutputTile
     The output tile. More...
     
    typedef Accumulators_ Accumulators
     
    typedef GlobalLoadIteratorC_ GlobalLoadIteratorC
     The iterator for C in global memory. More...
     
    typedef GlobalTransformerC_ GlobalTransformerC
     The transformer for C. More...
     
    typedef GlobalTransformerD_ GlobalTransformerD
     The transformer for D. More...
     
    typedef GlobalStoreIteratorD_ GlobalStoreIteratorD
     The iterator for D in global memory. More...
     
    typedef SharedStoreIteratorD_ SharedStoreIteratorD
     The iterator to store D in shared memory. More...
     
    typedef SharedStoreTransformerD_ SharedStoreTransformerD
     The shared store transformer for D. More...
     
    typedef SharedLoadIteratorD_ SharedLoadIteratorD
     The iterator to store D in shared memory. More...
     
    typedef Iterations_ Iterations
     typedef typename GemmConfig::EpilogueIterations Iterations; More...
     
    typedef Delta_ Delta
     The iterations strides. More...
     
    typedef Functor_ Functor
     The functor in charge of the math. More...
     
    typedef Index_ Index
     The index. More...
     
    typedef Functor::Scalar Scalar
     We do not support 3D or 4D shapes. More...
     
    typedef GlobalLoadIteratorC::Scalar ScalarC
     The scalar for C. More...
     
    typedef GlobalStoreIteratorD::Scalar ScalarD
     The scalar for D. More...
     
    typedef OutputTile_ OutputTile
     The output tile. More...
     
    typedef Accumulators_ Accumulators
     
    typedef GlobalLoadIteratorC_ GlobalLoadIteratorC
     The iterator for C in global memory. More...
     
    typedef GlobalTransformerC_ GlobalTransformerC
     The transformer for C. More...
     
    typedef GlobalTransformerD_ GlobalTransformerD
     The transformer for D. More...
     
    typedef GlobalStoreIteratorD_ GlobalStoreIteratorD
     The iterator for D in global memory. More...
     
    typedef SharedStoreIteratorD_ SharedStoreIteratorD
     The iterator to store D in shared memory. More...
     
    typedef SharedStoreTransformerD_ SharedStoreTransformerD
     The shared store transformer for D. More...
     
    typedef SharedLoadStreamD_ SharedLoadStreamD
     The stream to store D in shared memory. More...
     
    typedef Iterations_ Iterations
     typedef typename GemmConfig::EpilogueIterations Iterations; More...
     
    typedef Delta_ Delta
     The iterations strides. More...
     
    typedef Functor_ Functor
     The functor in charge of the math. More...
     
    typedef Index_ Index
     The index. More...
     
    typedef Functor::Scalar Scalar
     We do not support 3D or 4D shapes. More...
     
    typedef GlobalLoadIteratorC::Scalar ScalarC
     The scalar for C. More...
     
    typedef GlobalStoreIteratorD::Scalar ScalarD
     The scalar for D. More...
     

    Member Typedef Documentation

    - -

    ◆ Accumulators

    + +

    ◆ Accumulators

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef Accumulators_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Accumulatorstypedef Accumulators_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Accumulators
    @@ -163,160 +163,160 @@

    -

    ◆ Delta

    + +

    ◆ Delta

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef Delta_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Deltatypedef Delta_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Delta

    - -

    ◆ Functor

    + +

    ◆ Functor

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef Functor_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Functortypedef Functor_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Functor
    - -

    ◆ GlobalLoadIteratorC

    + +

    ◆ GlobalLoadIteratorC

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef GlobalLoadIteratorC_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::GlobalLoadIteratorCtypedef GlobalLoadIteratorC_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::GlobalLoadIteratorC
    - -

    ◆ GlobalStoreIteratorD

    + +

    ◆ GlobalStoreIteratorD

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef GlobalStoreIteratorD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::GlobalStoreIteratorDtypedef GlobalStoreIteratorD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::GlobalStoreIteratorD
    - -

    ◆ GlobalTransformerC

    + +

    ◆ GlobalTransformerC

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef GlobalTransformerC_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::GlobalTransformerCtypedef GlobalTransformerC_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::GlobalTransformerC
    - -

    ◆ GlobalTransformerD

    + +

    ◆ GlobalTransformerD

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef GlobalTransformerD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::GlobalTransformerDtypedef GlobalTransformerD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::GlobalTransformerD
    - -

    ◆ Index

    + +

    ◆ Index

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef Index_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Indextypedef Index_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Index
    - -

    ◆ Iterations

    + +

    ◆ Iterations

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef Iterations_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Iterationstypedef Iterations_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Iterations
    - -

    ◆ OutputTile

    + +

    ◆ OutputTile

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef OutputTile_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::OutputTiletypedef OutputTile_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::OutputTile
    - -

    ◆ Scalar

    + +

    ◆ Scalar

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef Functor::Scalar cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Scalartypedef Functor::Scalar cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Scalar
    @@ -324,80 +324,80 @@

    -

    ◆ ScalarC

    + +

    ◆ ScalarC

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef GlobalLoadIteratorC::Scalar cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::ScalarCtypedef GlobalLoadIteratorC::Scalar cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::ScalarC

    - -

    ◆ ScalarD

    + +

    ◆ ScalarD

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef GlobalStoreIteratorD::Scalar cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::ScalarDtypedef GlobalStoreIteratorD::Scalar cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::ScalarD
    - -

    ◆ SharedLoadIteratorD

    + +

    ◆ SharedLoadStreamD

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef SharedLoadIteratorD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::SharedLoadIteratorDtypedef SharedLoadStreamD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedLoadStreamD
    - -

    ◆ SharedStoreIteratorD

    + +

    ◆ SharedStoreIteratorD

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef SharedStoreIteratorD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::SharedStoreIteratorDtypedef SharedStoreIteratorD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStoreIteratorD
    - -

    ◆ SharedStoreTransformerD

    + +

    ◆ SharedStoreTransformerD

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    typedef SharedStoreTransformerD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::SharedStoreTransformerDtypedef SharedStoreTransformerD_ cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStoreTransformerD
    @@ -410,7 +410,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraitsHelper-members.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraitsHelper-members.html index 4f04d91f30..83a1f951ae 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraitsHelper-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraitsHelper-members.html @@ -91,14 +91,15 @@

    OutputTile typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    Scalar typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedLoadIteratorD typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedLoadTileTraits typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedStoreIteratorD typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedStoreTileTraits typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedStoreTransformerD typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedLoadStreamD typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedLoadTileTraits typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedStoreIteratorD typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedStoreTileTraits typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    SharedStoreTransformerD typedefcutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraitsHelper.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraitsHelper.html index 55d6652f41..642c964a62 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraitsHelper.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraitsHelper.html @@ -98,28 +98,31 @@
    typedef EpilogueFunctor_ Functor
     The functor to do the math in the epilogue. More...
     
    typedef GemmSharedStoreTileDTraits< typename Functor::Scalar, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, GemmConfig_::kScalarsPerStsD, 128/sizeof(typename GemmConfig_::ScalarD)/GemmConfig_::kScalarsPerStsD/2 *GemmConfig_::kScalarsPerStsD > SharedStoreTileTraits
     The traits class to build the iterator to store to shared memory for D. More...
     
    typedef TileStoreIterator< SharedStoreTileTraits, typename SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedStoreIteratorD
    typedef GemmSharedStoreTileDTraits< typename Functor::ScalarAccum, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, GemmConfig_::kScalarsPerStsD, 128/sizeof(typename GemmConfig_::ScalarD)/GemmConfig_::kScalarsPerStsD/2 *GemmConfig_::kScalarsPerStsD > SharedStoreTileTraits
     The traits class to build the iterator to store to shared memory for D. More...
     
    typedef TileStoreIterator< SharedStoreTileTraits, typename SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedStoreIteratorD
     The iterator to store D to shared memory. More...
     
    typedef Copy< typename SharedStoreIteratorD::FragmentSharedStoreTransformerD
    typedef Copy< typename SharedStoreIteratorD::FragmentSharedStoreTransformerD
     The shared store transformer for D. More...
     
    typedef GemmSharedLoadTileDTraits< typename Functor::Scalar, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, GemmConfig_::OutputTile::kH/ShapeCount< Iterations >::kCount, GemmConfig_::kScalarsPerLdsD, SharedStoreTileTraits::kSkewSharedLoadTileTraits
     The traits class to build the iterator to load from shared memory for D. More...
     
    typedef TileLoadIterator< SharedLoadTileTraits, typename SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedLoadIteratorD
    typedef GemmSharedLoadTileDTraits< typename Functor::ScalarAccum, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, GemmConfig_::OutputTile::kH/ShapeCount< Iterations >::kCount, GemmConfig_::kScalarsPerLdsD, SharedStoreTileTraits::kSkewSharedLoadTileTraits
     The traits class to build the iterator to load from shared memory for D. More...
     
    typedef TileLoadIterator< SharedLoadTileTraits, typename SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedLoadIteratorD
     The iterator to load D from shared memory. More...
     
    typedef SharedLoadStream< SharedLoadIteratorDSharedLoadStreamD
     The stream to load D. More...
     
    typedef GemmGlobalTileCdTraits< typename GemmConfig_::ScalarC const, Shape< 1, GemmConfig_::OutputTile::kH/ShapeCount< Iterations >::kCount, GemmConfig_::OutputTile::kW >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, Iterations::kW, GemmConfig_::kScalarsPerLdgC > GlobalLoadTileTraits
     The traits class to build the iterator to load data from global memory for C^N. More...
     
    typedef GemmGlobalIteratorCd< GlobalLoadTileTraits, Index_ > GlobalLoadIteratorC
     The iterator to load C. More...
     
    typedef Copy< typename GlobalLoadIteratorC::FragmentGlobalTransformerC
    typedef Copy< typename GlobalLoadIteratorC::FragmentGlobalTransformerC
     The transformer for C. More...
     
    typedef GemmGlobalTileCdTraits< typename GemmConfig_::ScalarD, Shape< 1, GemmConfig_::OutputTile::kH/ShapeCount< Iterations >::kCount, GemmConfig_::OutputTile::kW >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, Iterations::kW, GemmConfig_::kScalarsPerStgD > GlobalStoreTileTraits
    typedef GemmGlobalIteratorCd< GlobalStoreTileTraits, Index_ > GlobalStoreIteratorD
     The iterator to store D. More...
     
    typedef Copy< typename GlobalStoreIteratorD::FragmentGlobalTransformerD
    typedef Copy< typename GlobalStoreIteratorD::FragmentGlobalTransformerD
     The transformer for D. More...
     
    @@ -238,7 +241,7 @@

    typedef Copy<typename GlobalLoadIteratorC::Fragment> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::GlobalTransformerCtypedef Copy<typename GlobalLoadIteratorC::Fragment> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::GlobalTransformerC
    @@ -254,7 +257,7 @@

    typedef Copy<typename GlobalStoreIteratorD::Fragment> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::GlobalTransformerDtypedef Copy<typename GlobalStoreIteratorD::Fragment> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::GlobalTransformerD
    @@ -318,15 +321,15 @@

    typedef TileLoadIterator<SharedLoadTileTraits, typename SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedLoadIteratorDtypedef TileLoadIterator<SharedLoadTileTraits, typename SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedLoadIteratorD
    - -

    ◆ SharedLoadTileTraits

    + +

    ◆ SharedLoadStreamD

    @@ -334,7 +337,23 @@

    typedef GemmSharedLoadTileDTraits< typename Functor::Scalar, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, GemmConfig_::OutputTile::kH / ShapeCount<Iterations>::kCount, GemmConfig_::kScalarsPerLdsD, SharedStoreTileTraits::kSkew> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedLoadTileTraitstypedef SharedLoadStream<SharedLoadIteratorD> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedLoadStreamD
    +
    + +
    + + +

    ◆ SharedLoadTileTraits

    + +
    +
    +
    +template<typename GemmConfig_, typename EpilogueFunctor_, typename Index_ = int>
    + + +
    typedef GemmSharedLoadTileDTraits< typename Functor::ScalarAccum, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, GemmConfig_::OutputTile::kH / ShapeCount<Iterations>::kCount, GemmConfig_::kScalarsPerLdsD, SharedStoreTileTraits::kSkew> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedLoadTileTraits
    @@ -350,15 +369,15 @@

    typedef TileStoreIterator<SharedStoreTileTraits, typename SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedStoreIteratorDtypedef TileStoreIterator<SharedStoreTileTraits, typename SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedStoreIteratorD
    - -

    ◆ SharedStoreTileTraits

    + +

    ◆ SharedStoreTileTraits

    @@ -366,7 +385,7 @@

    typedef GemmSharedStoreTileDTraits< typename Functor::Scalar, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, GemmConfig_::kScalarsPerStsD, 128 / sizeof(typename GemmConfig_::ScalarD) / GemmConfig_::kScalarsPerStsD / 2 * GemmConfig_::kScalarsPerStsD> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedStoreTileTraitstypedef GemmSharedStoreTileDTraits< typename Functor::ScalarAccum, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, GemmConfig_::kScalarsPerStsD, 128 / sizeof(typename GemmConfig_::ScalarD) / GemmConfig_::kScalarsPerStsD / 2 * GemmConfig_::kScalarsPerStsD> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedStoreTileTraits
    @@ -382,7 +401,7 @@

    typedef Copy<typename SharedStoreIteratorD::Fragment> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedStoreTransformerDtypedef Copy<typename SharedStoreIteratorD::Fragment> cutlass::gemm::GemmEpilogueTraitsHelper< GemmConfig_, EpilogueFunctor_, Index_ >::SharedStoreTransformerD
    @@ -395,7 +414,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1Params-members.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1Params-members.html index b6a1ec7804..3608cbfb7e 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1Params-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1Params-members.html @@ -73,24 +73,24 @@

    -
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params Member List
    +
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params Member List
    -

    This is the complete list of members for cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params, including all inherited members.

    +

    This is the complete list of members for cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params, including all inherited members.

    - - - - - - - - + + + + + + + +
    functorcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params
    initialize(GemmDesc_ const &desc)cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Paramsinline
    iterator_ccutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params
    iterator_dcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params
    shared_load_iterator_dcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params
    shared_store_iterator_dcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params
    stride_hcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params
    stride_wcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params
    functorcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params
    initialize(GemmDesc_ const &desc)cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Paramsinline
    iterator_ccutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params
    iterator_dcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params
    shared_load_stream_dcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params
    shared_store_iterator_dcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params
    stride_hcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params
    stride_wcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1Params.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1Params.html index c94e55e07d..5f2f16c3cb 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1Params.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1Params.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params Struct Reference +Cutlass: cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params Struct Reference @@ -77,7 +77,7 @@ Public Attributes | List of all members
    -
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params Struct Reference
    +
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params Struct Reference
    @@ -88,42 +88,42 @@ - - - - + + + +

    Public Member Functions

    template<typename GemmDesc_ >
    CUTLASS_HOST_DEVICE int initialize (GemmDesc_ const &desc)
     Setup the params. More...
     
    template<typename GemmDesc_ >
    CUTLASS_HOST_DEVICE int initialize (GemmDesc_ const &desc)
     Setup the params. More...
     
    - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + +

    Public Attributes

    Index stride_h
     The strides for H and W in the different iterations of the epilogue. More...
     
    Index stride_w
     
    GlobalLoadIteratorC::Params iterator_c
     The params for the C iterator. More...
     
    GlobalStoreIteratorD::Params iterator_d
     The params for the D global iterator. More...
     
    SharedStoreIteratorD::Params shared_store_iterator_d
     The params for the D shared store iterator. More...
     
    SharedLoadIteratorD::Params shared_load_iterator_d
     The params for the D shared load iterator. More...
     
    Functor::Params functor
     The functor params. More...
     
    Index stride_h
     The strides for H and W in the different iterations of the epilogue. More...
     
    Index stride_w
     
    GlobalLoadIteratorC::Params iterator_c
     The params for the C iterator. More...
     
    GlobalStoreIteratorD::Params iterator_d
     The params for the D global iterator. More...
     
    SharedStoreIteratorD::Params shared_store_iterator_d
     The params for the D shared store iterator. More...
     
    SharedLoadStreamD::Params shared_load_stream_d
     The params for the D shared load stream. More...
     
    Functor::Params functor
     The functor params. More...
     

    Member Function Documentation

    - -

    ◆ initialize()

    + +

    ◆ initialize()

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    template<typename GemmDesc_ >
    @@ -131,7 +131,7 @@

    - + @@ -148,112 +148,112 @@

    Member Data Documentation

    - -

    ◆ functor

    + +

    ◆ functor

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    CUTLASS_HOST_DEVICE int cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params::initialize CUTLASS_HOST_DEVICE int cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params::initialize ( GemmDesc_ const &  desc)
    - +
    Functor::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params::functorFunctor::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params::functor
    - -

    ◆ iterator_c

    + +

    ◆ iterator_c

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    GlobalLoadIteratorC::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params::iterator_cGlobalLoadIteratorC::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params::iterator_c
    - -

    ◆ iterator_d

    + +

    ◆ iterator_d

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    GlobalStoreIteratorD::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params::iterator_dGlobalStoreIteratorD::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params::iterator_d
    - -

    ◆ shared_load_iterator_d

    + +

    ◆ shared_load_stream_d

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    SharedLoadIteratorD::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params::shared_load_iterator_dSharedLoadStreamD::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params::shared_load_stream_d
    - -

    ◆ shared_store_iterator_d

    + +

    ◆ shared_store_iterator_d

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    SharedStoreIteratorD::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params::shared_store_iterator_dSharedStoreIteratorD::Params cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params::shared_store_iterator_d
    - -

    ◆ stride_h

    + +

    ◆ stride_h

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    Index cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params::stride_hIndex cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params::stride_h
    - -

    ◆ stride_w

    + +

    ◆ stride_w

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    Index cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::Params::stride_wIndex cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::Params::stride_w
    @@ -266,7 +266,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1SharedStorage-members.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1SharedStorage-members.html index 4856ef616e..9fdea968c3 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1SharedStorage-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1SharedStorage-members.html @@ -73,17 +73,18 @@

    -
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage Member List
    +
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage Member List
    -

    This is the complete list of members for cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage, including all inherited members.

    +

    This is the complete list of members for cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage, including all inherited members.

    - + +
    shared_streamcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage
    data()cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorageinline
    shared_streamcutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1SharedStorage.html b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1SharedStorage.html index 4cad48c739..cad6b91dae 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1SharedStorage.html +++ b/docs/structcutlass_1_1gemm_1_1GemmEpilogueTraits_1_1SharedStorage.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage Struct Reference +Cutlass: cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage Struct Reference @@ -73,10 +73,11 @@
    -
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage Struct Reference
    +
    cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage Struct Reference
    @@ -85,22 +86,55 @@

    #include <gemm_epilogue_traits.h>

    + + + +

    +Public Member Functions

    CUTLASS_DEVICE ScalarDdata ()
     
    - - + +

    Public Attributes

    StreamSharedStorage shared_stream
     
    StreamSharedStorage shared_stream
     
    +

    Member Function Documentation

    + +

    ◆ data()

    + +
    +
    +
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    + + + + + +
    + + + + + + + +
    CUTLASS_DEVICE ScalarD* cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage::data ()
    +
    +inline
    +
    + +
    +

    Member Data Documentation

    - -

    ◆ shared_stream

    + +

    ◆ shared_stream

    -template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadIteratorD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    +template<typename OutputTile_, typename Accumulators_, typename GlobalLoadIteratorC_, typename GlobalTransformerC_, typename GlobalTransformerD_, typename GlobalStoreIteratorD_, typename SharedStoreIteratorD_, typename SharedStoreTransformerD_, typename SharedLoadStreamD_, typename Iterations_, typename Delta_, typename Functor_, typename Index_ = int>
    - +
    StreamSharedStorage cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadIteratorD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage::shared_streamStreamSharedStorage cutlass::gemm::GemmEpilogueTraits< OutputTile_, Accumulators_, GlobalLoadIteratorC_, GlobalTransformerC_, GlobalTransformerD_, GlobalStoreIteratorD_, SharedStoreIteratorD_, SharedStoreTransformerD_, SharedLoadStreamD_, Iterations_, Delta_, Functor_, Index_ >::SharedStorage::shared_stream
    @@ -113,7 +147,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb-members.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb-members.html index 7e4746ea81..83f2695807 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb-members.html @@ -79,62 +79,69 @@

    This is the complete list of members for cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >, including all inherited members.

    - - - - - + + + + + - - - - - - - - - - - + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    AccessType typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    Base typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    BaseParams typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    data() constcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    Delta typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    AccessType typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    add_pointer_offset(Index offset)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    Base typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    BaseParams typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    Delta typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    Fragment typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    FragmentConstIterator typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    FragmentElement typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    FragmentIterator typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    FragmentShape typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    GemmGlobalIteratorAb(Params const &_params, const Coord< 3 > &bounds, const Coord< 3 > &block, ThreadOffset thread_offset_func=ThreadOffset())cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    ImmediateOffsetStrides typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > >
    inc_advance()cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    inc_d()cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    inc_h()cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    inc_stage()cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    inc_w()cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    FragmentConstIterator typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    FragmentElement typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    FragmentIterator typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    FragmentShape typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    GemmGlobalIteratorAb(Params const &_params, const Coord< 3 > &bounds, const Coord< 3 > &threadblock_offset, ThreadOffset thread_offset_func=ThreadOffset())cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    ImmediateOffsetStrides typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, FragmentElementType::kScalar, Shape< 0, 0, 0, 0 > >
    inc_advance()cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    inc_d()cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    inc_h()cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    inc_stage()cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    inc_w()cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    Index typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    initialize_predicates(const Coord< 3 > &bounds, const Coord< 3 > &block)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >::initialize_predicates(PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    Iterations typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    kAccessSizecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > >static
    kAdvancecutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >static
    kFragmentSizecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > >static
    kIteratorFragmentcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >static
    kLayoutcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >static
    kMemorySpacecutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >static
    kRequiresLoadFence enum valuecutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    load(Fragment &fragment, PredicateIterator pred_it) constcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    load(Fragment &fragment) constcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    load_post_increment(Fragment &fragment, PredicateIterator pred_it)cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    load_post_increment(Fragment &fragment)cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    paramscutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    Pointer typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    predicatescutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    PredicateVector typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    residue(Index k)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    Scalar typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    SharedStorage typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    Skew typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    stagecutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    Storage typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > >
    This_ typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    thread_offsetcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    ThreadOffset typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    Threads typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    Tile typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    TileLoadIterator()cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    TileLoadIterator(Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    TileLoadIterator(Params const &, SharedStorage &shared_storage, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    Traits typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    valid(int d, int h, int w, int c) constcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    initialize_predicates(const Coord< 3 > &bounds, const Coord< 3 > &block_offset)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >::initialize_predicates(PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >::initialize_predicates(PredicateIterator predicate_it, PredicateFunctor const &functor, Coord< 3 > const &block_offset)cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    Iterations typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    kAccessSizecutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >static
    kAdvancecutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >static
    kFragmentElementTypecutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >static
    kFragmentSizecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, FragmentElementType::kScalar, Shape< 0, 0, 0, 0 > >static
    kLayoutcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >static
    kMemorySpacecutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >static
    kRequiresLoadFence enum valuecutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    load(Fragment &fragment, PredicateIterator pred_it) constcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    load(Fragment &fragment) constcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    load(Fragment &fragment, int d)cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    load_element(typename Base::AccessType &value, int d, int h, int w, int c) constcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    load_post_increment(Fragment &fragment)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >::load_post_increment(Fragment &fragment, PredicateIterator pred_it)cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >::load_post_increment(Fragment &fragment)cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    operator+=(Coord< 3 > const &offset)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    paramscutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    Pointer typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    predicatescutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    PredicateVector typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    residue(Index k)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    Scalar typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    SharedStorage typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    Skew typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    stagecutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    Storage typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, FragmentElementType::kScalar, Shape< 0, 0, 0, 0 > >
    stride_advance(void)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline
    TensorRef typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    This_ typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    thread_offsetcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    ThreadOffset typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    Threads typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    Tile typedefcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >
    TileLoadIterator()cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    TileLoadIterator(Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    TileLoadIterator(Params const &, Scalar const *ptr, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >inline
    Traits typedefcutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    valid(int d, int h, int w, int c) constcutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >inline

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb.html index 4210572d79..a795acf02c 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb.html @@ -92,7 +92,8 @@ cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ > -cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > > +cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, FragmentElementType::kScalar, Shape< 0, 0, 0, 0 > > +cutlass::gemm::IgemmGlobalIteratorAb< TileTraits_, Index_ >
    @@ -109,7 +110,10 @@ - + + + + @@ -124,7 +128,7 @@ - + @@ -133,188 +137,218 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    typedef TileLoadIterator< TileTraits_, typename TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ > Base
     The base class. More...
     
    typedef Base::Fragment Fragment
    typedef TileTraits_::Tile Tile
     The tile. More...
     
    typedef Base::Fragment Fragment
     Fragment type loaded by the iterator. More...
     
    typedef TileTraits_::Scalar Scalar
    typedef TileTraits_::ThreadOffset ThreadOffset
     The thread offset. More...
     
    typedef cutlass::PredicateVector< ShapeCount< typename Base::Iterations >::kCount > PredicateVector
    typedef cutlass::PredicateVector< ShapeCount< typename Base::Iterations >::kCount > PredicateVector
     
    typedef Base::Params BaseParams
     Iterator parameters type. More...
    enum  
     Do we require a fence? More...
     
    typedef TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > > Base
     Base class. More...
     
    typedef Base::Traits Traits
     concept TileTraits More...
     
    typedef Base::Scalar Scalar
     Scalar element. More...
     
    typedef Base::FragmentElement FragmentElement
     Fragment element. More...
     
    typedef Base::Index Index
     Index type. More...
     
    typedef Base::Skew Skew
     Skew quantity. More...
     
    typedef Base::Tile Tile
     Tile shape. More...
     
    typedef Base::Delta Delta
     Delta. More...
     
    typedef Base::Iterations Iterations
     Iterations. More...
     
    typedef Base::ThreadOffset ThreadOffset
     ThreadOffset functor. More...
     
    typedef Base::FragmentShape FragmentShape
     Fragment type. More...
     
    typedef Base::AccessType AccessType
     Memory access type. More...
     
    typedef Base::Fragment Fragment
     Fragment definition. More...
     
    typedef Base::FragmentIterator FragmentIterator
     Fragment iterator definition. More...
     
    typedef Base::FragmentConstIterator FragmentConstIterator
     Fragment const iterator definition. More...
     
    typedef Base::PredicateVector PredicateVector
     Default predicate mask type. More...
     
    typedef Base::Storage SharedStorage
     Storage object that may be loaded from. More...
     
    typedef Base::Params BaseParams
     IteratorBase parameters. More...
     
    typedef Scalar const * Pointer
     The pointer type. More...
     
    - Public Types inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > >
    typedef TileTraits_ Traits
     concept TileTraits More...
     
    typedef TileTraits_::Scalar Scalar
     Scalar element. More...
     
    typedef TileTraits_::Scalar FragmentElement
     Fragment element. More...
     
    typedef Index_ Index
     Index type. More...
     
    typedef Shape< 0, 0, 0, 0 > Skew
     Skew quantity. More...
     
    typedef Traits::Tile Tile
     Tile shape. More...
     
    typedef Traits::Delta Delta
     Distance along each dimension. More...
     
    typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
     The strides in each dimension between different loads/stores. More...
     
    typedef Traits::Iterations Iterations
     Iterations. More...
     
    typedef Traits::ThreadOffset ThreadOffset
     Thread offset. More...
     
    typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
     The elements loaded/store by one instruction. More...
     
    typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
     The storage. More...
     
    typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
     The fragment. More...
     
    typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
     The fragment iterator. More...
     
    typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
     The fragment const iterator. More...
     
    typedef FragmentIterator::FragmentShape FragmentShape
     The shape of the fragment. More...
     
    typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
     Default predicate mask type. More...
     
    typedef TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, FragmentElementType::kScalar, Shape< 0, 0, 0, 0 > > Base
     Base class. More...
     
    typedef Base::Traits Traits
     concept TileTraits More...
     
    typedef Base::Scalar Scalar
     Scalar element. More...
     
    typedef TileTraits_::Scalar FragmentElement
     Fragment element. More...
     
    typedef Base::Index Index
     Index type. More...
     
    typedef Base::Skew Skew
     Skew quantity. More...
     
    typedef Base::Tile Tile
     Tile shape. More...
     
    typedef Base::Delta Delta
     Delta. More...
     
    typedef Base::Iterations Iterations
     Iterations. More...
     
    typedef Base::ThreadOffset ThreadOffset
     ThreadOffset functor. More...
     
    typedef Base::FragmentShape FragmentShape
     Fragment type. More...
     
    typedef Base::AccessType AccessType
     Memory access type. More...
     
    typedef Base::Fragment Fragment
     Fragment definition. More...
     
    typedef Base::FragmentIterator FragmentIterator
     Fragment iterator definition. More...
     
    typedef Base::FragmentConstIterator FragmentConstIterator
     Fragment const iterator definition. More...
     
    typedef Base::PredicateVector PredicateVector
     Default predicate mask type. More...
     
    typedef Base::Storage SharedStorage
     Storage object that may be loaded from. More...
     
    typedef Base::Params BaseParams
     IteratorBase parameters. More...
     
    typedef Scalar const * Pointer
     The pointer type. More...
     
    typedef TensorRef< Scalar const, 4 > TensorRef
     Tensor reference for the load iterator. More...
     
    - Public Types inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, FragmentElementType::kScalar, Shape< 0, 0, 0, 0 > >
    typedef TileTraits_ Traits
     concept TileTraits More...
     
    typedef TileTraits_::Scalar Scalar
     Scalar element. More...
     
    typedef TileTraits_::Scalar FragmentElement
     Fragment element. More...
     
    typedef Index_ Index
     Index type. More...
     
    typedef Shape< 0, 0, 0, 0 > Skew
     Skew quantity. More...
     
    typedef Traits::Tile Tile
     Tile shape. More...
     
    typedef Traits::Delta Delta
     Distance along each dimension. More...
     
    typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
     The strides in each dimension between different loads/stores. More...
     
    typedef Traits::Iterations Iterations
     Iterations. More...
     
    typedef Traits::ThreadOffset ThreadOffset
     Thread offset. More...
     
    typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
     The elements loaded/store by one instruction. More...
     
    typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
     The storage. More...
     
    typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
     The fragment. More...
     
    typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
     The fragment iterator. More...
     
    typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
     The fragment const iterator. More...
     
    typedef FragmentIterator::FragmentShape FragmentShape
     The shape of the fragment. More...
     
    typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
     Default predicate mask type. More...
     
    - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Public Member Functions

    CUTLASS_DEVICE void initialize_predicates (const Coord< 3 > &bounds, const Coord< 3 > &block)
     
    CUTLASS_DEVICE GemmGlobalIteratorAb (Params const &_params, const Coord< 3 > &bounds, const Coord< 3 > &block, ThreadOffset thread_offset_func=ThreadOffset())
     Ctor. More...
     
    CUTLASS_DEVICE void inc_h ()
     Increment the pointer in the H dimension. More...
     
    CUTLASS_DEVICE void inc_d ()
     Increment the pointer in the D dimension. More...
     
    CUTLASS_DEVICE void inc_advance ()
     Increment the pointer to move to the next iteration. More...
     
    CUTLASS_HOST_DEVICE Scalar const * data () const
     Returns the current pointer. More...
     
    CUTLASS_DEVICE void residue (Index k)
     That's the residue! Update the predicates. More...
     
    CUTLASS_DEVICE bool valid (int d, int h, int w, int c) const
     Is the iterator valid? More...
     
    CUTLASS_HOST_DEVICE void initialize_predicates (const Coord< 3 > &bounds, const Coord< 3 > &block_offset)
     
    CUTLASS_HOST_DEVICE GemmGlobalIteratorAb (Params const &_params, const Coord< 3 > &bounds, const Coord< 3 > &threadblock_offset, ThreadOffset thread_offset_func=ThreadOffset())
     Ctor. More...
     
    CUTLASS_HOST_DEVICE void inc_w ()
     Increment the pointer in the W dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_h ()
     Increment the pointer in the H dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_d ()
     Increment the pointer in the D dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_advance ()
     Increment the pointer to move to the next iteration. More...
     
    CUTLASS_HOST_DEVICE void load_element (typename Base::AccessType &value, int d, int h, int w, int c) const
     Loads a single fragment element from memory. More...
     
    CUTLASS_HOST_DEVICE void residue (Index k)
     That's the residue! Update the predicates. More...
     
    CUTLASS_HOST_DEVICE bool valid (int d, int h, int w, int c) const
     Is the iterator valid? More...
     
    CUTLASS_HOST_DEVICE GemmGlobalIteratorAb & operator+= (Coord< 3 > const &offset)
     Adds a vector offset to the iterator. More...
     
    CUTLASS_HOST_DEVICE void add_pointer_offset (Index offset)
     
    CUTLASS_HOST_DEVICE Index stride_advance (void)
     
    template<typename Fragment >
    CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment)
     
    - Public Member Functions inherited from cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))
     Initializes a predicate vector. More...
     
    CUTLASS_HOST_DEVICE TileLoadIterator ()
     Default constructor. More...
     
    CUTLASS_HOST_DEVICE TileLoadIterator (Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
     Constructs a tile load iterator. More...
     
    CUTLASS_HOST_DEVICE TileLoadIterator (Params const &, SharedStorage &shared_storage, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
     Constructs a tile load iterator. More...
     
    CUTLASS_HOST_DEVICE Scalar const * data () const
     Returns the current pointer. More...
     
    CUTLASS_HOST_DEVICE void inc_d ()
     Increment in the D dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_h ()
     Increment in the H dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_w ()
     Increment in the W dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_advance ()
     Increment in the next dimension. More...
     
    CUTLASS_DEVICE void inc_stage ()
     Increment the stage. More...
     
    CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment, PredicateIterator pred_it)
     Loads a fragment and advances the iterator to the next tile. More...
     
    CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment)
     Loads a fragment and advances the iterator to the next tile. More...
     
    CUTLASS_HOST_DEVICE void load (Fragment &fragment, PredicateIterator pred_it) const
     Loads a fragment without advancing the iterator. More...
     
    CUTLASS_HOST_DEVICE void load (Fragment &fragment) const
     Loads a fragment without advancing the iterator. More...
     
    - Public Member Functions inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > >
    CUTLASS_DEVICE bool valid (int d, int h, int w, int c) const
     Is the iterator valid? More...
     
    CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &block_offset=make_Coord(0, 0, 0))
     Initializes a predicate vector using a RegularTilePredicateFunctor. More...
     
    CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, PredicateFunctor const &functor, Coord< 3 > const &block_offset)
     Initializes a predicate vector using an arbitrary predicate functor. More...
     
    CUTLASS_HOST_DEVICE TileLoadIterator ()
     Default constructor. More...
     
    CUTLASS_HOST_DEVICE TileLoadIterator (Params const &_params, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
     Constructs a tile load iterator. More...
     
    CUTLASS_HOST_DEVICE TileLoadIterator (Params const &, Scalar const *ptr, Coord< 3 > const &block_offset=make_Coord(0, 0, 0), ThreadOffset thread_offset_func=ThreadOffset())
     Constructs a tile load iterator. More...
     
    CUTLASS_HOST_DEVICE void inc_d ()
     Increment in the D dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_h ()
     Increment in the H dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_w ()
     Increment in the W dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_advance ()
     Increment in the next dimension. More...
     
    CUTLASS_HOST_DEVICE void load_element (AccessType &value, int d, int h, int w, int c) const
     Loads a single fragment element from memory. More...
     
    CUTLASS_HOST_DEVICE void inc_stage ()
     Increment the stage. More...
     
    CUTLASS_HOST_DEVICE TileLoadIterator & operator+= (Coord< 3 > const &offset)
     Adds a vector offset to the iterator. More...
     
    CUTLASS_HOST_DEVICE void add_pointer_offset (Index offset)
     Adds a raw offset to the pointer. More...
     
    CUTLASS_HOST_DEVICE Index stride_advance (void)
     
    CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment, PredicateIterator pred_it)
     Loads a fragment and advances the iterator to the next tile. More...
     
    CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment)
     Loads a fragment and advances the iterator to the next tile. More...
     
    CUTLASS_HOST_DEVICE void load (Fragment &fragment, PredicateIterator pred_it) const
     Loads a fragment without advancing the iterator. More...
     
    CUTLASS_HOST_DEVICE void load (Fragment &fragment) const
     Loads a fragment without advancing the iterator. More...
     
    CUTLASS_HOST_DEVICE void load (Fragment &fragment, int d)
     Loads a fragment without advancing the iterator. More...
     
    - Public Member Functions inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, FragmentElementType::kScalar, Shape< 0, 0, 0, 0 > >
    CUTLASS_HOST_DEVICE bool valid (int d, int h, int w, int c) const
     Is the iterator valid? More...
     
    @@ -328,15 +362,15 @@ - - - - - - - - - + + + + + + + + +

    Public Attributes

     The predicates. More...
     
    - Public Attributes inherited from cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    Params params
     Parameters structure. More...
     
    Coord< 4 > thread_offset
     Offset of an individual lane from the start of the tile. More...
     
    int stage
     Stage argument enables wrapping after some number of tiles have been loaded. More...
     
    Params params
     Parameters structure. More...
     
    Coord< 4 > thread_offset
     Offset of an individual lane from the start of the tile. More...
     
    int stage
     Stage argument enables wrapping after some number of tiles have been loaded. More...
     
    @@ -347,38 +381,41 @@ - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Static Public Attributes

     Specifies in which dimension post-increment accesses advance. More...
     
    - Static Public Attributes inherited from cutlass::TileLoadIterator< TileTraits_, TileTraits_::Scalar, TileTraits_::MultiplicandTraits::kKstrided ? IteratorAdvance::kH :IteratorAdvance::kW, MemorySpace::kGlobal, Index_ >
    static IteratorAdvance::Kind const kAdvance
     Specifies in which dimension post-increment accesses advance. More...
     
    static IteratorFragment::Kind const kIteratorFragment
     Specifies type of iterator fragment storage (Scalar or WmmaMatrix) More...
     
    static MemorySpace::Kind const kMemorySpace
     Source or destination memory space. More...
     
    - Static Public Attributes inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > >
    static IteratorAdvance::Kind const kAdvance
     Specifies dimension in which post-increment accesses advance. More...
     
    static IteratorFragment::Kind const kIteratorFragment
     Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
     
    static MemorySpace::Kind const kMemorySpace
     Source or destination memory space. More...
     
    static int const kAccessSize
     The number of scalars accessed per load/store. More...
     
    static int const kFragmentSize
     The size of storage needed per fragment. More...
     
    static IteratorAdvance::Kind const kAdvance
     Specifies in which dimension post-increment accesses advance. More...
     
    static FragmentElementType::Kind const kFragmentElementType
     Specifies type of iterator fragment storage (Scalar or WmmaMatrix) More...
     
    static MemorySpace::Kind const kMemorySpace
     Source or destination memory space. More...
     
    static int const kAccessSize
     The number of scalars accessed per load/store. More...
     
    - Static Public Attributes inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, FragmentElementType::kScalar, Shape< 0, 0, 0, 0 > >
    static IteratorAdvance::Kind const kAdvance
     Specifies dimension in which post-increment accesses advance. More...
     
    static FragmentElementType::Kind const kFragmentElementType
     Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
     
    static MemorySpace::Kind const kMemorySpace
     Source or destination memory space. More...
     
    static int const kAccessSize
     The number of scalars accessed per load/store. More...
     
    static int const kFragmentSize
     The size of storage needed per fragment. More...
     
    - - - - + + + +

    Additional Inherited Members

    - Static Public Member Functions inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, IteratorFragment::kScalar, Shape< 0, 0, 0, 0 > >
    static CUTLASS_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &offset=make_Coord(0, 0, 0))
     Initializes a predicate vector. More...
     
    - Static Public Member Functions inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, Advance_, MemorySpace, Index_, TileTraits_::Scalar, FragmentElementType::kScalar, Shape< 0, 0, 0, 0 > >
    static CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, PredicateFunctor const &predicate_func, Coord< 3 > const &offset)
     Initializes a predicate vector. More...
     

    Member Typedef Documentation

    @@ -422,7 +459,7 @@

    typedef Base::Fragment cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Fragmenttypedef Base::Fragment cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Fragment
    @@ -454,7 +491,7 @@

    typedef cutlass::PredicateVector<ShapeCount<typename Base::Iterations>::kCount> cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::PredicateVectortypedef cutlass::PredicateVector<ShapeCount<typename Base::Iterations>::kCount> cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::PredicateVector
    @@ -523,11 +560,27 @@

    +

    + + +

    ◆ Tile

    + +
    +
    +
    +template<typename TileTraits_ , typename Index_ = int>
    + + + + +
    typedef TileTraits_::Tile cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Tile
    +
    +

    Constructor & Destructor Documentation

    - -

    ◆ GemmGlobalIteratorAb()

    + +

    ◆ GemmGlobalIteratorAb()

    + +

    ◆ inc_advance()

    - + @@ -631,8 +712,8 @@

    -

    ◆ inc_d()

    + +

    ◆ inc_h()

    @@ -643,7 +724,7 @@

    CUTLASS_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::inc_advance CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::inc_d ( )
    - + @@ -658,8 +739,8 @@

    -

    ◆ inc_h()

    + +

    ◆ inc_w()

    @@ -670,7 +751,7 @@

    CUTLASS_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::inc_d CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::inc_h ( )
    - + @@ -685,8 +766,8 @@

    -

    ◆ initialize_predicates()

    + +

    ◆ initialize_predicates()

    @@ -697,7 +778,7 @@

    CUTLASS_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::inc_h CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::inc_w ( )
    - + @@ -706,7 +787,7 @@

    - + @@ -723,8 +804,122 @@

    -

    ◆ residue()

    + +

    ◆ load_element()

    + +
    +
    +
    +template<typename TileTraits_ , typename Index_ = int>
    +
    CUTLASS_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::initialize_predicates CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::initialize_predicates ( const Coord< 3 > &  bounds, const Coord< 3 > & block block_offset 
    + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::load_element (typename Base::AccessType & value,
    int d,
    int h,
    int w,
    int c 
    ) const
    +
    +inline
    +
    + +
    + + +

    ◆ load_post_increment()

    + +
    +
    +
    +template<typename TileTraits_ , typename Index_ = int>
    +
    +template<typename Fragment >
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::load_post_increment (Fragment & fragment)
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator+=()

    + +
    +
    +
    +template<typename TileTraits_ , typename Index_ = int>
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmGlobalIteratorAb& cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::operator+= (Coord< 3 > const & offset)
    +
    +inline
    +
    + +
    +
    + +

    ◆ residue()

    + +

    ◆ valid()

    @@ -763,7 +986,7 @@

    - + @@ -904,7 +1127,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb.png b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb.png index f6dfb59583..06b073c800 100644 Binary files a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb.png and b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb.png differ diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params-members.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params-members.html index 71243aedf0..af3680b9c5 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params-members.html @@ -79,25 +79,35 @@

    This is the complete list of members for cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Params, including all inherited members.

    CUTLASS_DEVICE bool cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::valid CUTLASS_HOST_DEVICE bool cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::valid ( int  d,
    - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + +
    inc_advancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    inc_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    inc_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    inc_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    initialize(Scalar const *ptr, Index stride_h)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize(SharedStorage const &storage)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize(Scalar const *ptr, Index stride_d, Index stride_h, Index stride_w)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize(Scalar const *ptr, Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
    cutlass::TileIteratorBase::Params::initialize(Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
    cutlass::TileIteratorBase::Params::initialize(Index _stride_d, Index _stride_h, Index _stride_w)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Paramsinline
    pointercutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    stride_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    stride_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    stride_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    inc_advancecutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    inc_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    inc_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    inc_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    initialize(Scalar const *ptr, long long stride_d, Index stride_h)cutlass::gemm::GemmGlobalIteratorAb< TileTraits_, Index_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize(TensorRef const &ref)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize(SharedStorage const &storage)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize(Scalar const *ptr)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize(Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize(Scalar const *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileLoadIterator::Params::initialize()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileIteratorBase::Params::initialize(long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileIteratorBase::Params::initialize(Coord< 4 > const &stride)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileIteratorBase::Params::initialize(long long _stride_d, Index _stride_h, Index _stride_w)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    Params()cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    Params(Scalar const *ptr)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    Params(TensorRef const &ref)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    Params(Scalar const *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    Params(Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w)cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileIteratorBase::Params::Params(long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    cutlass::TileIteratorBase::Params::Params(Coord< 4 > const &stride)cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Paramsinline
    pointercutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    stride_dcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    stride_hcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    stride_wcutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params.html index d4517b31e9..8f3134b397 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorAb_1_1Params.html @@ -87,62 +87,97 @@
    -cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params -cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params +cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params +cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Public Member Functions

    CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, Index stride_h)
     Initializes params to load a strip-mined tile, given pointer and stride_h. More...
     
    - Public Member Functions inherited from cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    CUTLASS_HOST_DEVICE int initialize (SharedStorage const &storage)
     Initialize params to access storage object. More...
     
    CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, Index stride_d, Index stride_h, Index stride_w)
     Initializes params to access a raw pointer. More...
     
    CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
     Initializes params. More...
     
    CUTLASS_HOST_DEVICE int initialize ()
     
    - Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    CUTLASS_HOST_DEVICE int initialize (Index _stride_d, Index _stride_h, Index _stride_w, Index _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
     Initializes params. More...
     
    CUTLASS_HOST_DEVICE int initialize (Index _stride_d, Index _stride_h, Index _stride_w)
     
    CUTLASS_HOST_DEVICE int initialize ()
     
    CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, long long stride_d, Index stride_h)
     Initializes params to load a strip-mined tile, given pointer and stride_h. More...
     
    - Public Member Functions inherited from cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    CUTLASS_HOST_DEVICE Params ()
     Initialize params to access storage object. More...
     
    CUTLASS_HOST_DEVICE Params (Scalar const *ptr)
     Initialize params to access storage object. More...
     
    CUTLASS_HOST_DEVICE Params (TensorRef const &ref)
     Constructs with a CompactTensorRef<> More...
     
    CUTLASS_HOST_DEVICE Params (Scalar const *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
     Initialize params to access storage object. More...
     
    CUTLASS_HOST_DEVICE Params (Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w)
     Initialize params to access storage object. More...
     
    CUTLASS_HOST_DEVICE int initialize (TensorRef const &ref)
     Initializes params to access a raw pointer. More...
     
    CUTLASS_HOST_DEVICE int initialize (SharedStorage const &storage)
     Initialize params to access storage object. More...
     
    CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr)
     Initialize params to access storage object. More...
     
    CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, long long stride_d, Index stride_h, Index stride_w)
     Initializes params to access a raw pointer. More...
     
    CUTLASS_HOST_DEVICE int initialize (Scalar const *ptr, long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, Index _inc_advance)
     Initializes params. More...
     
    CUTLASS_HOST_DEVICE int initialize ()
     
    - Public Member Functions inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    CUTLASS_HOST_DEVICE Params ()
     Constructs params. More...
     
    CUTLASS_HOST_DEVICE Params (long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)
     Constructs params. More...
     
    CUTLASS_HOST_DEVICE Params (Coord< 4 > const &stride)
     Constructs params with a stride vector. More...
     
    CUTLASS_HOST_DEVICE int initialize (long long _stride_d, Index _stride_h, Index _stride_w, long long _inc_d, Index _inc_h, Index _inc_w, long long _inc_advance)
     Initializes params. More...
     
    CUTLASS_HOST_DEVICE int initialize (Coord< 4 > const &stride)
     Initializes the parameters object from a vector of strides. More...
     
    CUTLASS_HOST_DEVICE int initialize (long long _stride_d, Index _stride_h, Index _stride_w)
     Initializes the parameters object from a vector of strides. More...
     
    CUTLASS_HOST_DEVICE int initialize ()
     Gotta have this. More...
     
    - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + +

    Additional Inherited Members

    - Public Attributes inherited from cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    Scalar const * pointer
     Pointer to memory. More...
     
    - Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, IteratorFragment_, Skew_ >::Params
    Index stride_d
     
    Index stride_h
     
    Index stride_w
     
    Index inc_d
     
    Index inc_h
     
    Index inc_w
     
    Index inc_advance
     
    - Public Attributes inherited from cutlass::TileLoadIterator< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    Scalar const * pointer
     Pointer to memory. More...
     
    - Public Attributes inherited from cutlass::TileIteratorBase< Traits_, Scalar_, Advance_, MemorySpace, Index_, FragmentElement_, FragmentElementType_, Skew_ >::Params
    long long stride_d
     
    Index stride_h
     
    Index stride_w
     
    long long inc_d
     
    Index inc_h
     
    Index inc_w
     
    long long inc_advance
     

    Member Function Documentation

    - -

    ◆ initialize()

    + +

    ◆ initialize()

    @@ -158,6 +193,12 @@

    Scalar const * 

    ptr,
    long long stride_d,
    - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - - + + + + - - - + + +
    AccessType typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    Base typedefcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    data()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    data() constcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    Delta typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    Fragment typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    FragmentConstIterator typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    FragmentElement typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    FragmentIterator typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    FragmentShape typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    GemmGlobalIteratorCd()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    GemmGlobalIteratorCd(Params const &params, const Coord< 3 > &bounds, const Coord< 3 > &block, int offset=0, int pred_offset=0, ThreadOffset thread_offset_func=ThreadOffset())cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    ImmediateOffsetStrides typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    inc_advance()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    inc_c()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    inc_d()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    inc_h()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    inc_w()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    Index typedefcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    initialize_predicates(PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &offset=make_Coord(0, 0, 0))cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >inlinestatic
    Iterations typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    kAccessSizecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    kAdvancecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    kFragmentSizecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    kIteratorFragmentcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    kLayoutcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >static
    kMemorySpacecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    AccessType typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    add_pointer_offset(Index offset)cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    Base typedefcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    Delta typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    Fragment typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    FragmentConstIterator typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    FragmentElement typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    FragmentIterator typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    FragmentShape typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    GemmGlobalIteratorCd(Params const &_params, const Coord< 3 > &bounds, const Coord< 3 > &block_offset, ThreadOffset thread_offset_func=ThreadOffset())cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    GemmGlobalIteratorCd(Params const &_params, const Coord< 3 > &bounds, const Coord< 3 > &block, int offset=0, int pred_offset=0, ThreadOffset thread_offset_func=ThreadOffset())cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    ImmediateOffsetStrides typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    inc_advance()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    inc_c()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    inc_d()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    inc_h()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    inc_w()cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    Index typedefcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    initialize_predicates(PredicateIterator predicate_it, PredicateFunctor const &predicate_func, Coord< 3 > const &offset)cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >inlinestatic
    Iterations typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    kAccessSizecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    kAdvancecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    kFragmentElementTypecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    kFragmentSizecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    kLayoutcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >static
    kMemorySpacecutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >static
    load_element(typename Base::AccessType &value, int d, int h, int w, int c) constcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    load_post_increment(Fragment &fragment)cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    operator+=(Coord< 3 > const &offset)cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    paramscutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    Pointer typedefcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    predicatescutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    PredicateVector typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    PredicateVector typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    Scalar typedefcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    Skew typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    Storage typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    Skew typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    Storage typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    store_element(typename Base::AccessType const &value, int d, int h, int w, int c)cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    store_post_increment(Fragment &fragment)cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    This_ typedefcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    thread_offsetcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    ThreadOffset typedefcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    Threads typedefcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >
    Tile typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    Traits typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    valid(int d, int h, int w, int c) constcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    Tile typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    Traits typedefcutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    valid(int d, int h, int w, int c) constcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >inline
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd.html index 6af473203d..8159a864ee 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd.html @@ -92,6 +92,7 @@ cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ > +cutlass::gemm::WmmaGemmGlobalIteratorCd< TileTraits_, Index_ > @@ -125,97 +126,112 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
     The thread offset. More...
     
    - Public Types inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    typedef TileTraits_ Traits
     concept TileTraits More...
     
    typedef TileTraits_::Scalar Scalar
     Scalar element. More...
     
    typedef TileTraits_::Scalar FragmentElement
     Fragment element. More...
     
    typedef Index_ Index
     Index type. More...
     
    typedef Shape< 0, 0, 0, 0 > Skew
     Skew quantity. More...
     
    typedef Traits::Tile Tile
     Tile shape. More...
     
    typedef Traits::Delta Delta
     Distance along each dimension. More...
     
    typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
     The strides in each dimension between different loads/stores. More...
     
    typedef Traits::Iterations Iterations
     Iterations. More...
     
    typedef Traits::ThreadOffset ThreadOffset
     Thread offset. More...
     
    typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
     The elements loaded/store by one instruction. More...
     
    typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
     The storage. More...
     
    typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
     The fragment. More...
     
    typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
     The fragment iterator. More...
     
    typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
     The fragment const iterator. More...
     
    typedef FragmentIterator::FragmentShape FragmentShape
     The shape of the fragment. More...
     
    typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
     Default predicate mask type. More...
     
    typedef TileTraits_ Traits
     concept TileTraits More...
     
    typedef TileTraits_::Scalar Scalar
     Scalar element. More...
     
    typedef TileTraits_::Scalar FragmentElement
     Fragment element. More...
     
    typedef Index_ Index
     Index type. More...
     
    typedef Shape< 0, 0, 0, 0 > Skew
     Skew quantity. More...
     
    typedef Traits::Tile Tile
     Tile shape. More...
     
    typedef Traits::Delta Delta
     Distance along each dimension. More...
     
    typedef Traits::ImmediateOffsetStrides ImmediateOffsetStrides
     The strides in each dimension between different loads/stores. More...
     
    typedef Traits::Iterations Iterations
     Iterations. More...
     
    typedef Traits::ThreadOffset ThreadOffset
     Thread offset. More...
     
    typedef Vectorize< FragmentElement, kAccessSize >::Type AccessType
     The elements loaded/store by one instruction. More...
     
    typedef Fragment< Scalar, ShapeCount< Tile >::kCount, kFragmentSizeStorage
     The storage. More...
     
    typedef Fragment< FragmentElement, ShapeCount< Iterations >::kCount *kAccessSizeFragment
     The fragment. More...
     
    typedef FragmentIterator< Fragment, Iterations, AccessTypeFragmentIterator
     The fragment iterator. More...
     
    typedef FragmentConstIterator< Fragment, Iterations, AccessTypeFragmentConstIterator
     The fragment const iterator. More...
     
    typedef FragmentIterator::FragmentShape FragmentShape
     The shape of the fragment. More...
     
    typedef PredicateVector< ShapeCount< Iterations >::kCount > PredicateVector
     Default predicate mask type. More...
     
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + +

    Public Member Functions

    CUTLASS_DEVICE GemmGlobalIteratorCd ()
     Ctor. More...
     
    CUTLASS_DEVICE GemmGlobalIteratorCd (Params const &params, const Coord< 3 > &bounds, const Coord< 3 > &block, int offset=0, int pred_offset=0, ThreadOffset thread_offset_func=ThreadOffset())
     Ctor. More...
     
    CUTLASS_DEVICE void inc_c ()
     Increment the pointer in the C dimension. More...
     
    CUTLASS_DEVICE void inc_w ()
     Increment the pointer in the W dimension. More...
     
    CUTLASS_DEVICE void inc_h ()
     Increment the pointer in the H dimension. More...
     
    CUTLASS_DEVICE void inc_d ()
     Increment the pointer in the D dimension. More...
     
    CUTLASS_DEVICE void inc_advance ()
     Increment the pointer to move to the next iteration. More...
     
    CUTLASS_DEVICE bool valid (int d, int h, int w, int c) const
     Test the validity of the iterator. More...
     
    CUTLASS_HOST_DEVICE Pointer data ()
     Returns the raw pointer. More...
     
    CUTLASS_HOST_DEVICE Pointer const data () const
     
    CUTLASS_HOST_DEVICE GemmGlobalIteratorCd (Params const &_params, const Coord< 3 > &bounds, const Coord< 3 > &block_offset, ThreadOffset thread_offset_func=ThreadOffset())
     Ctor. More...
     
    CUTLASS_HOST_DEVICE GemmGlobalIteratorCd (Params const &_params, const Coord< 3 > &bounds, const Coord< 3 > &block, int offset=0, int pred_offset=0, ThreadOffset thread_offset_func=ThreadOffset())
     Ctor. More...
     
    CUTLASS_HOST_DEVICE void inc_c ()
     Increment the pointer in the C dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_w ()
     Increment the pointer in the W dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_h ()
     Increment the pointer in the H dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_d ()
     Increment the pointer in the D dimension. More...
     
    CUTLASS_HOST_DEVICE void inc_advance ()
     Increment the pointer to move to the next iteration. More...
     
    CUTLASS_HOST_DEVICE GemmGlobalIteratorCdoperator+= (Coord< 3 > const &offset)
     Adds a vector offset to the iterator. More...
     
    CUTLASS_HOST_DEVICE void load_element (typename Base::AccessType &value, int d, int h, int w, int c) const
     Loads a single fragment element from memory. More...
     
    CUTLASS_HOST_DEVICE void store_element (typename Base::AccessType const &value, int d, int h, int w, int c)
     Stores a single fragment element into memory. More...
     
    CUTLASS_HOST_DEVICE bool valid (int d, int h, int w, int c) const
     Test the validity of the. More...
     
    CUTLASS_HOST_DEVICE void add_pointer_offset (Index offset)
     add pointer offset More...
     
    template<typename Fragment >
    CUTLASS_HOST_DEVICE void load_post_increment (Fragment &fragment)
     Loads and increments iterator. More...
     
    template<typename Fragment >
    CUTLASS_HOST_DEVICE void store_post_increment (Fragment &fragment)
     
    - Public Member Functions inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    CUTLASS_DEVICE bool valid (int d, int h, int w, int c) const
     Is the iterator valid? More...
     
    CUTLASS_HOST_DEVICE bool valid (int d, int h, int w, int c) const
     Is the iterator valid? More...
     
    + @@ -230,28 +246,28 @@ - - - - - - - - - - - - - - - + + + + + + + + + + + + + + +

    Public Attributes

    Params params
     Parameters. More...
     
    Coord< 4 > thread_offset
     Offset of an individual lane from the start of the tile. More...
     The layout. More...
     
    - Static Public Attributes inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    static IteratorAdvance::Kind const kAdvance
     Specifies dimension in which post-increment accesses advance. More...
     
    static IteratorFragment::Kind const kIteratorFragment
     Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
     
    static MemorySpace::Kind const kMemorySpace
     Source or destination memory space. More...
     
    static int const kAccessSize
     The number of scalars accessed per load/store. More...
     
    static int const kFragmentSize
     The size of storage needed per fragment. More...
     
    static IteratorAdvance::Kind const kAdvance
     Specifies dimension in which post-increment accesses advance. More...
     
    static FragmentElementType::Kind const kFragmentElementType
     Specifies iterator storage fragment type (Scalar or WmmaMatrix) More...
     
    static MemorySpace::Kind const kMemorySpace
     Source or destination memory space. More...
     
    static int const kAccessSize
     The number of scalars accessed per load/store. More...
     
    static int const kFragmentSize
     The size of storage needed per fragment. More...
     
    - - - + + +

    Additional Inherited Members

    - Static Public Member Functions inherited from cutlass::TileIteratorBase< TileTraits_, TileTraits_::Scalar, IteratorAdvance::kH, MemorySpace::kGlobal, Index_ >
    static CUTLASS_DEVICE void initialize_predicates (PredicateIterator predicate_it, Coord< 3 > const &bounds, Coord< 3 > const &offset=make_Coord(0, 0, 0))
     Initializes a predicate vector. More...
     
    static CUTLASS_HOST_DEVICE void initialize_predicates (PredicateIterator predicate_it, PredicateFunctor const &predicate_func, Coord< 3 > const &offset)
     Initializes a predicate vector. More...
     

    Member Typedef Documentation

    @@ -367,8 +383,8 @@

    Constructor & Destructor Documentation

    - -

    ◆ GemmGlobalIteratorCd() [1/2]

    + +

    ◆ GemmGlobalIteratorCd() [1/2]

    - + - + @@ -457,8 +496,8 @@

    Member Function Documentation

    - -

    ◆ data() [1/2]

    + +

    ◆ add_pointer_offset()

    @@ -469,9 +508,10 @@

    CUTLASS_DEVICE cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::GemmGlobalIteratorCd CUTLASS_HOST_DEVICE cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::GemmGlobalIteratorCd ( Params const & params, _params,
    - + - + +
    CUTLASS_HOST_DEVICE Pointer cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::data CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::add_pointer_offset ()Index offset)
    @@ -484,8 +524,8 @@

    -

    ◆ data() [2/2]

    + +

    ◆ inc_advance()

    - + @@ -538,8 +578,8 @@

    -

    ◆ inc_c()

    + +

    ◆ inc_d()

    @@ -550,7 +590,7 @@

    CUTLASS_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::inc_advance CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::inc_c ( )
    - + @@ -565,8 +605,8 @@

    -

    ◆ inc_d()

    + +

    ◆ inc_h()

    @@ -577,7 +617,7 @@

    CUTLASS_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::inc_c CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::inc_d ( )
    - + @@ -592,8 +632,8 @@

    -

    ◆ inc_h()

    + +

    ◆ inc_w()

    @@ -604,7 +644,7 @@

    CUTLASS_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::inc_d CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::inc_h ( )
    - + @@ -619,8 +659,8 @@

    -

    ◆ inc_w()

    + +

    ◆ load_element()

    @@ -631,9 +671,182 @@

    CUTLASS_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::inc_h CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::inc_w ( )
    - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    CUTLASS_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::inc_w CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::load_element ()typename Base::AccessTypevalue,
    int d,
    int h,
    int w,
    int c 
    ) const
    +
    +inline
    +
    + +
    + + +

    ◆ load_post_increment()

    + +
    +
    +
    +template<typename TileTraits_ , typename Index_ = int>
    +
    +template<typename Fragment >
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::load_post_increment (Fragmentfragment)
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator+=()

    + +
    +
    +
    +template<typename TileTraits_ , typename Index_ = int>
    + + + + + +
    + + + + + + + + +
    CUTLASS_HOST_DEVICE GemmGlobalIteratorCd& cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::operator+= (Coord< 3 > const & offset)
    +
    +inline
    +
    + +
    +
    + +

    ◆ store_element()

    + +
    +
    +
    +template<typename TileTraits_ , typename Index_ = int>
    + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::store_element (typename Base::AccessType const & value,
    int d,
    int h,
    int w,
    int c 
    )
    +
    +inline
    +
    + +
    +
    + +

    ◆ store_post_increment()

    + +
    +
    +
    +template<typename TileTraits_ , typename Index_ = int>
    +
    +template<typename Fragment >
    + + + + + + + + + - + @@ -266,6 +284,22 @@

    + + + +

    ◆ stride_d

    + +
    +
    +
    +template<typename TileTraits_ , typename Index_ = int>
    +

    + + + + + +
    CUTLASS_HOST_DEVICE void cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::store_post_increment (Fragmentfragment)
    @@ -646,8 +859,8 @@

    -

    ◆ valid()

    + +

    ◆ valid()

    @@ -658,7 +871,7 @@

    - + @@ -775,7 +988,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd.png b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd.png index 13e8ac2aa9..24971967c7 100644 Binary files a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd.png and b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd.png differ diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params-members.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params-members.html index aa6c60c858..2492594fe3 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params-members.html @@ -81,16 +81,17 @@

    CUTLASS_DEVICE bool cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::valid CUTLASS_HOST_DEVICE bool cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::valid ( int  d,
    - + - + +
    inc_advancecutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params
    inc_hcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params
    initialize(Pointer pointer, Index ld, Index bound, Index epilogue_stride_w, Index epilogue_delta_w)cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Paramsinline
    initialize(Pointer pointer, long long batch_stride, Index ldm, Index bound, Index epilogue_stride_w, Index epilogue_delta_w)cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Paramsinline
    pointercutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params
    predicate_inc_advancecutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params
    predicate_inc_hcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params
    predicate_offsetcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params
    stride_hcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params
    stride_dcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params
    stride_hcutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params.html index 9b30fd7844..7c1b105796 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params.html @@ -85,18 +85,30 @@

    #include <gemm_global_tile.h>

    +
    +Inheritance diagram for cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params:
    +
    +
    + + +cutlass::gemm::WmmaGemmGlobalIteratorCd< TileTraits_, Index_ >::Params + +
    - - - + + +

    Public Member Functions

    CUTLASS_HOST_DEVICE int initialize (Pointer pointer, Index ld, Index bound, Index epilogue_stride_w, Index epilogue_delta_w)
     Setup the params. More...
     
    CUTLASS_HOST_DEVICE int initialize (Pointer pointer, long long batch_stride, Index ldm, Index bound, Index epilogue_stride_w, Index epilogue_delta_w)
     Setup the params. More...
     
    + + + @@ -115,8 +127,8 @@

    Public Attributes

    Pointer pointer
     The pointer. More...
     
    long long stride_d
     The stride in the D dimension. More...
     
    Index stride_h
     The stride in the H dimension to setup the thread in the block. More...
     
     

    Member Function Documentation

    - -

    ◆ initialize()

    + +

    ◆ initialize()

    @@ -132,11 +144,17 @@

    Pointer 

    pointer,
    long long batch_stride,
    Index ld, ldm,
    + + + +
    long long cutlass::gemm::GemmGlobalIteratorCd< TileTraits_, Index_ >::Params::stride_d
    +
    +
    @@ -290,7 +324,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params.png b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params.png new file mode 100644 index 0000000000..a31a8a5a82 Binary files /dev/null and b/docs/structcutlass_1_1gemm_1_1GemmGlobalIteratorCd_1_1Params.png differ diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits-members.html index 92fd6a4c08..2cf66086a2 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits-members.html @@ -93,11 +93,12 @@

    Scalar typedefcutlass::gemm::GemmGlobalTileTraits< GemmOperand::kC, MatrixLayout::kColumnMajor, Scalar_, Tile_, Threads_, kAccessSize_ >
    Threads typedefcutlass::gemm::GemmGlobalTileCdTraits< Scalar_, Tile_, Threads_, kStrideH_, kAccessSize_ >
    ThreadsDelta typedefcutlass::gemm::GemmGlobalTileCdTraits< Scalar_, Tile_, Threads_, kStrideH_, kAccessSize_ >
    Tile typedefcutlass::gemm::GemmGlobalTileTraits< GemmOperand::kC, MatrixLayout::kColumnMajor, Scalar_, Tile_, Threads_, kAccessSize_ >
    Tile typedefcutlass::gemm::GemmGlobalTileTraits< GemmOperand::kC, MatrixLayout::kColumnMajor, Scalar_, Tile_, Threads_, kAccessSize_ >
    VectorizedTile typedefcutlass::gemm::GemmGlobalTileTraits< GemmOperand::kC, MatrixLayout::kColumnMajor, Scalar_, Tile_, Threads_, kAccessSize_ >
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits.html index 3aed66b5bb..9de76685a1 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits.html @@ -107,11 +107,11 @@
    typedef Shape< 0, 0, Base::Delta::kW, Base::Delta::kCDelta
     Override the strides in each dimension between different loads/stores. More...
     
    typedef Base::Iterations Iterations
    typedef Base::Iterations Iterations
     
    typedef Base::Threads Threads
    typedef Base::Threads Threads
     
    typedef Base::ThreadsDelta ThreadsDelta
    typedef Base::ThreadsDelta ThreadsDelta
     
    typedef Base::ImmediateOffsetStrides ImmediateOffsetStrides
     
    typedef Scalar_ * Pointer
     The pointer. More...
     
    typedef ReshapeTile< Tile_, kAccessSize_ >::Tile Tile
     The tile shape. More...
     
    typedef ReshapeThreads< Tile, Threads_ >::Threads Threads
     The threads shape. More...
     
    typedef Shape< 1, 1, Tile::kC > ThreadsDelta
     The relative offset between two elements in the H/W dimension in adjacent threads. More...
     
    typedef Tile_ Tile
     The tile shape. More...
     
    typedef ReshapeTile< Tile_, kAccessSize_ >::Tile VectorizedTile
     The vectorized tile shape. More...
     
    typedef ReshapeThreads< VectorizedTile, Threads_ >::Threads Threads
     The threads shape. More...
     
    typedef Shape< 1, 1, VectorizedTile::kC > ThreadsDelta
     The relative offset between two elements in the H/W dimension in adjacent threads. More...
     
    typedef Shape< 0, Threads::kH, Threads::kW *kAccessSizeDelta
     The strides in each dimension between different loads/stores. More...
     
    typedef Shape< 0, 0, Threads::kW *ThreadsDelta::kW, kAccessSizeImmediateOffsetStrides
     Strides for immediate offset computation. More...
     
    typedef Shape< 1, Tile::kH/Threads::kH, Tile::kW/Threads::kW, Tile::kC/kAccessSizeIterations
     The number of iterations needed to load/store the tile. More...
     
    typedef GemmMultiplicandTraits< Tile, kOperand, kLayoutMultiplicandTraits
    typedef Shape< 1, VectorizedTile::kH/Threads::kH, VectorizedTile::kW/Threads::kW, VectorizedTile::kC/kAccessSizeIterations
     The number of iterations needed to load/store the tile. More...
     
    typedef GemmMultiplicandTraits< Tile, kOperand, kLayoutMultiplicandTraits
     
    - +

    @@ -220,7 +223,7 @@

    typedef Base::Iterations cutlass::gemm::GemmGlobalTileCdTraits< Scalar_, Tile_, Threads_, kStrideH_, kAccessSize_ >::Iterationstypedef Base::Iterations cutlass::gemm::GemmGlobalTileCdTraits< Scalar_, Tile_, Threads_, kStrideH_, kAccessSize_ >::Iterations
    @@ -236,7 +239,7 @@

    typedef Base::Threads cutlass::gemm::GemmGlobalTileCdTraits< Scalar_, Tile_, Threads_, kStrideH_, kAccessSize_ >::Threadstypedef Base::Threads cutlass::gemm::GemmGlobalTileCdTraits< Scalar_, Tile_, Threads_, kStrideH_, kAccessSize_ >::Threads
    @@ -252,7 +255,7 @@

    typedef Base::ThreadsDelta cutlass::gemm::GemmGlobalTileCdTraits< Scalar_, Tile_, Threads_, kStrideH_, kAccessSize_ >::ThreadsDeltatypedef Base::ThreadsDelta cutlass::gemm::GemmGlobalTileCdTraits< Scalar_, Tile_, Threads_, kStrideH_, kAccessSize_ >::ThreadsDelta
    @@ -290,7 +293,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits_1_1ThreadOffset-members.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits_1_1ThreadOffset-members.html index 1510bcf26b..192e27a01c 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits_1_1ThreadOffset.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits_1_1ThreadOffset.html index 524a06aa49..8feee54ba0 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileCdTraits_1_1ThreadOffset.html @@ -124,7 +124,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits-members.html index 6225081e82..652e9a86e7 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits-members.html @@ -81,7 +81,7 @@ - + @@ -89,13 +89,14 @@ - - - + + + +
    Delta typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    ImmediateOffsetStrides typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Iterations typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Iterations typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    kAccessSizecutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >static
    kLayoutcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >static
    kMemorySpacecutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >static
    MultiplicandTraits typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Pointer typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Scalar typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Threads typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    ThreadsDelta typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Tile typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Threads typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    ThreadsDelta typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Tile typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    VectorizedTile typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits.html index 4e61285ebf..7f8c05e726 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits.html @@ -90,7 +90,7 @@ cutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ > -cutlass::gemm::IgemmContiguousGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ > +cutlass::gemm::IgemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ > @@ -108,25 +108,28 @@ - - - - - - - - - + + + + + + + + + + + + - - - - + + + +
    typedef Scalar_ * Pointer
     The pointer. More...
     
    typedef ReshapeTile< Tile_, kAccessSize_ >::Tile Tile
     The tile shape. More...
     
    typedef ReshapeThreads< Tile, Threads_ >::Threads Threads
     The threads shape. More...
     
    typedef Shape< 1, 1, Tile::kC > ThreadsDelta
     The relative offset between two elements in the H/W dimension in adjacent threads. More...
     
    typedef Tile_ Tile
     The tile shape. More...
     
    typedef ReshapeTile< Tile_, kAccessSize_ >::Tile VectorizedTile
     The vectorized tile shape. More...
     
    typedef ReshapeThreads< VectorizedTile, Threads_ >::Threads Threads
     The threads shape. More...
     
    typedef Shape< 1, 1, VectorizedTile::kC > ThreadsDelta
     The relative offset between two elements in the H/W dimension in adjacent threads. More...
     
    typedef Shape< 0, Threads::kH, Threads::kW *kAccessSizeDelta
     The strides in each dimension between different loads/stores. More...
     
    typedef Shape< 0, 0, Threads::kW *ThreadsDelta::kW, kAccessSizeImmediateOffsetStrides
     Strides for immediate offset computation. More...
     
    typedef Shape< 1, Tile::kH/Threads::kH, Tile::kW/Threads::kW, Tile::kC/kAccessSizeIterations
     The number of iterations needed to load/store the tile. More...
     
    typedef GemmMultiplicandTraits< Tile, kOperand, kLayoutMultiplicandTraits
    typedef Shape< 1, VectorizedTile::kH/Threads::kH, VectorizedTile::kW/Threads::kW, VectorizedTile::kC/kAccessSizeIterations
     The number of iterations needed to load/store the tile. More...
     
    typedef GemmMultiplicandTraits< Tile, kOperand, kLayoutMultiplicandTraits
     
    - +

    @@ -177,8 +180,8 @@

    -

    ◆ Iterations

    + +

    ◆ Iterations

    @@ -186,7 +189,7 @@

    typedef Shape<1, Tile::kH / Threads::kH, Tile::kW / Threads::kW, Tile::kC / kAccessSize> cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Iterationstypedef Shape<1, VectorizedTile::kH / Threads::kH, VectorizedTile::kW / Threads::kW, VectorizedTile::kC / kAccessSize> cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Iterations
    @@ -202,7 +205,7 @@

    typedef GemmMultiplicandTraits<Tile, kOperand, kLayout> cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::MultiplicandTraitstypedef GemmMultiplicandTraits<Tile, kOperand, kLayout> cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::MultiplicandTraits
    @@ -241,8 +244,8 @@

    -

    ◆ Threads

    + +

    ◆ Threads

    @@ -250,15 +253,15 @@

    typedef ReshapeThreads<Tile, Threads_>::Threads cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Threadstypedef ReshapeThreads<VectorizedTile, Threads_>::Threads cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Threads
    - -

    ◆ ThreadsDelta

    + +

    ◆ ThreadsDelta

    @@ -266,15 +269,15 @@

    typedef Shape<1, 1, Tile::kC> cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::ThreadsDeltatypedef Shape<1, 1, VectorizedTile::kC> cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::ThreadsDelta
    - -

    ◆ Tile

    + +

    ◆ Tile

    @@ -282,7 +285,23 @@

    typedef ReshapeTile<Tile_, kAccessSize_>::Tile cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Tiletypedef Tile_ cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Tile
    +
    + +
    + + +

    ◆ VectorizedTile

    + +
    +
    +
    +template<GemmOperand::Kind kOperand_, MatrixLayout::Kind kLayout_, typename Scalar_, typename Tile_, typename Threads_, int kAccessSize_>
    + + +
    typedef ReshapeTile<Tile_, kAccessSize_>::Tile cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::VectorizedTile
    @@ -392,7 +411,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits.png b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits.png index 4c9bada463..739d64830b 100644 Binary files a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits.png and b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits.png differ diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits_1_1ThreadOffset-members.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits_1_1ThreadOffset-members.html index 634804a60f..4937c4fcce 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits_1_1ThreadOffset.html b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits_1_1ThreadOffset.html index 7b47addb66..50dc0a99f5 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1gemm_1_1GemmGlobalTileTraits_1_1ThreadOffset.html @@ -124,7 +124,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmMultiplicandTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmMultiplicandTraits-members.html index db9bc1bc7c..bb83157310 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmMultiplicandTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmMultiplicandTraits-members.html @@ -87,7 +87,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmMultiplicandTraits.html b/docs/structcutlass_1_1gemm_1_1GemmMultiplicandTraits.html index 121fe8cce1..ca9cf53710 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmMultiplicandTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmMultiplicandTraits.html @@ -220,7 +220,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmOperandTraitsAb-members.html b/docs/structcutlass_1_1gemm_1_1GemmOperandTraitsAb-members.html index 03950b5a98..8ea3efa969 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmOperandTraitsAb-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmOperandTraitsAb-members.html @@ -83,7 +83,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmOperandTraitsAb.html b/docs/structcutlass_1_1gemm_1_1GemmOperandTraitsAb.html index 39721d9540..68f5c1a442 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmOperandTraitsAb.html +++ b/docs/structcutlass_1_1gemm_1_1GemmOperandTraitsAb.html @@ -121,7 +121,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits-members.html index a317e544fa..6cda4b9edb 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits-members.html @@ -79,8 +79,8 @@

    This is the complete list of members for cutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >, including all inherited members.

    - - + + @@ -99,7 +99,7 @@
    Delta typedefcutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    ImmediateOffsetStrides typedefcutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    Delta typedefcutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    ImmediateOffsetStrides typedefcutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    Iterations typedefcutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    kAccessSizecutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >static
    kMemorySpacecutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >static
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits.html index 27c32f35e9..d309569a2e 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits.html @@ -119,12 +119,11 @@

    typedef Shape< 1, 1, TileWithoutSkew::kW/kWarps/kThreadsPerWarpIterations
     The number of iterations needed to load/store the tile. More...
     
    typedef Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > Delta
     The strides in each dimension between different loads/stores. More...
     
    typedef Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > ImmediateOffsetStrides
     The strides in each dimension between different loads/stores. More...
     
    typedef Shape< TileWithSkew::kW *Warps::kD, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > ImmediateOffsetStrides
     The strides in each dimension between different loads/stores. More...
     
    typedef Shape< TileWithSkew::kW *Warps::kD, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > Delta
     
    @@ -147,8 +146,8 @@

    Static Public Attributes

     

    Member Typedef Documentation

    - -

    ◆ Delta

    + +

    ◆ Delta

    @@ -156,15 +155,15 @@

    typedef Shape<TileWithSkew::kW, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> cutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >::Deltatypedef Shape<TileWithSkew::kW * Warps::kD, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> cutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >::Delta
    - -

    ◆ ImmediateOffsetStrides

    + +

    ◆ ImmediateOffsetStrides

    @@ -172,7 +171,7 @@

    - typedef Shape<TileWithSkew::kW, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> cutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >::ImmediateOffsetStrides + typedef Shape<TileWithSkew::kW * Warps::kD, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> cutlass::gemm::GemmSharedLoadTileATraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >::ImmediateOffsetStrides

    @@ -474,7 +473,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits_1_1ThreadOffset-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits_1_1ThreadOffset-members.html index 3e308db63e..11c167a811 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits_1_1ThreadOffset.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits_1_1ThreadOffset.html index 0731bce879..53deb57df8 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileATraits_1_1ThreadOffset.html @@ -124,7 +124,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits-members.html index 782aa8415e..4a08989998 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits-members.html @@ -79,8 +79,8 @@

    This is the complete list of members for cutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >, including all inherited members.

    - - + + @@ -99,7 +99,7 @@
    Delta typedefcutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    ImmediateOffsetStrides typedefcutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    Delta typedefcutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    ImmediateOffsetStrides typedefcutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    Iterations typedefcutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >
    kAccessSizecutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >static
    kMemorySpacecutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >static

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits.html index 097ce43efa..a4f04fe772 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits.html @@ -119,12 +119,11 @@ typedef Shape< 1, 1, TileWithoutSkew::kW/kWarps/kThreadsPerWarpIterations  The number of iterations needed to load/store the tile. More...
      -typedef Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > Delta - The strides in each dimension between different loads/stores. More...
    -  -typedef Shape< TileWithSkew::kW, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > ImmediateOffsetStrides - The strides in each dimension between different loads/stores. More...
    -  +typedef Shape< TileWithSkew::kW *Warps::kD, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > ImmediateOffsetStrides + The strides in each dimension between different loads/stores. More...
    +  +typedef Shape< TileWithSkew::kW *Warps::kD, 0, kWarps *kThreadsPerWarp *kAccessSize, 0 > Delta +  @@ -147,8 +146,8 @@

    Static Public Attributes

     

    Member Typedef Documentation

    - -

    ◆ Delta

    + +

    ◆ Delta

    @@ -156,15 +155,15 @@

    - typedef Shape<TileWithSkew::kW, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> cutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >::Delta + typedef Shape<TileWithSkew::kW * Warps::kD, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> cutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >::Delta

    - -

    ◆ ImmediateOffsetStrides

    + +

    ◆ ImmediateOffsetStrides

    @@ -172,7 +171,7 @@

    - typedef Shape<TileWithSkew::kW, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> cutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >::ImmediateOffsetStrides + typedef Shape<TileWithSkew::kW * Warps::kD, 0, kWarps * kThreadsPerWarp * kAccessSize, 0> cutlass::gemm::GemmSharedLoadTileBTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, InstructionShape_, kStages_, kScalarsPerLds_, kSkew_ >::ImmediateOffsetStrides

    @@ -474,7 +473,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits_1_1ThreadOffset-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits_1_1ThreadOffset-members.html index 387441df03..e1be4227fd 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits_1_1ThreadOffset.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits_1_1ThreadOffset.html index 07f462ac6c..9fb5f6bf7c 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileBTraits_1_1ThreadOffset.html @@ -124,7 +124,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits-members.html index afc22fe8e5..7caaf5c4f1 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits-members.html @@ -79,9 +79,9 @@

    This is the complete list of members for cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >, including all inherited members.

    - - - + + + @@ -90,17 +90,18 @@ - - - - - - - + + + + + + + +
    Delta typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    ImmediateOffsetStrides typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Iterations typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Delta typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    ImmediateOffsetStrides typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Iterations typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    kAccessSizecutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >static
    kIterationsDcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >static
    kIterationsHcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >static
    kScalarsPerRowcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >static
    kScalarsPerThreadcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >static
    kSkewcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >static
    kThreadscutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >static
    OutputTile typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Pointer typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Scalar typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    ThreadsPerWarp typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Tile typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Warps typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    kSplitKcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >static
    kThreadscutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >static
    OutputTile typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Pointer typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Scalar typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    ThreadsPerWarp typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Tile typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >
    Warps typedefcutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits.html index 043d8c3ae2..191deeca66 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits.html @@ -108,17 +108,16 @@  The threads in the warps. More...
      typedef Shape< 1, 2, kScalarsPerRow/kAccessSize, kAccessSizeTile - The tile. More...
      -typedef Shape< kIterationsD, kIterationsH, OutputTile::kW/kWarpSize/kAccessSizeIterations - The number of iterations needed to store the tile. More...
    -  -typedef Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSizeDelta - The strides in each dimension between different loads/stores. More...
    -  -typedef Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSizeImmediateOffsetStrides - The strides in each dimension between different loads/stores. More...
    -  +typedef Shape< kIterationsD, kIterationsH, OutputTile::kW/kWarpSize/kAccessSize, Warps::kD > Iterations + The number of iterations needed to store the tile. More...
    +  +typedef Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSize, kSplitKImmediateOffsetStrides + The strides in each dimension between different loads/stores. More...
    +  +typedef Shape< OutputTile::kW, kScalarsPerRow, kWarpSize *kAccessSize, kSplitKDelta + The strides in each dimension between different loads/stores. More...
    +  @@ -146,10 +145,12 @@ + +

    Static Public Attributes

     
    static int const kIterationsD = kIterationsInHPerWarp / kIterationsH
     
    static int const kSplitK = OutputTile::kW * ThreadsPerWarp::kH / 2 * Warps::kH
     

    Member Typedef Documentation

    - -

    ◆ Delta

    + +

    ◆ Delta

    @@ -157,15 +158,15 @@

    - typedef Shape<OutputTile::kW, kScalarsPerRow, kWarpSize * kAccessSize> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Delta + typedef Shape<OutputTile::kW, kScalarsPerRow, kWarpSize * kAccessSize, kSplitK> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Delta

    - -

    ◆ ImmediateOffsetStrides

    + +

    ◆ ImmediateOffsetStrides

    @@ -173,15 +174,15 @@

    - typedef Shape<OutputTile::kW, kScalarsPerRow, kWarpSize * kAccessSize> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::ImmediateOffsetStrides + typedef Shape<OutputTile::kW, kScalarsPerRow, kWarpSize * kAccessSize, kSplitK> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::ImmediateOffsetStrides

    - -

    ◆ Iterations

    + +

    ◆ Iterations

    @@ -189,7 +190,7 @@

    - typedef Shape<kIterationsD, kIterationsH, OutputTile::kW / kWarpSize / kAccessSize> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Iterations + typedef Shape<kIterationsD, kIterationsH, OutputTile::kW / kWarpSize / kAccessSize, Warps::kD> cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::Iterations

    @@ -483,6 +485,30 @@

    +

    + + +

    ◆ kSplitK

    + +
    +
    +
    +template<typename Scalar_ , typename OutputTile_ , typename Warps_ , typename ThreadsPerWarp_ , int kTileH_, int kScalarsPerLds_, int kSkew_ = 0>
    + + + + + +
    + + + + +
    int const cutlass::gemm::GemmSharedLoadTileDTraits< Scalar_, OutputTile_, Warps_, ThreadsPerWarp_, kTileH_, kScalarsPerLds_, kSkew_ >::kSplitK = OutputTile::kW * ThreadsPerWarp::kH / 2 * Warps::kH
    +
    +static
    +
    +
    @@ -515,7 +541,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits_1_1ThreadOffset-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits_1_1ThreadOffset-members.html index 1a9ffe26c4..56ad54c93f 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits_1_1ThreadOffset.html b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits_1_1ThreadOffset.html index d68dda08ba..16eb03b3f5 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedLoadTileDTraits_1_1ThreadOffset.html @@ -124,7 +124,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits-members.html index eb2702a3f7..1638037a22 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits-members.html @@ -93,7 +93,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits.html index f755f52fca..28ec8e3e85 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits.html @@ -336,7 +336,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits_1_1ThreadOffset-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits_1_1ThreadOffset-members.html index 6157a4d397..0d4cc8a96b 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits_1_1ThreadOffset.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits_1_1ThreadOffset.html index 876eea666a..6fa883234e 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileAbTraits_1_1ThreadOffset.html @@ -121,7 +121,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits-members.html index 5749940cee..7b95fb3213 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits-members.html @@ -97,7 +97,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits.html index a5e0b8d6ce..73f9f0a046 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits.html @@ -437,7 +437,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits_1_1ThreadOffset-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits_1_1ThreadOffset-members.html index 8a28c51900..7f16e8200c 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits_1_1ThreadOffset.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits_1_1ThreadOffset.html index 673f9afb92..e47cf125f1 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreTileDTraits_1_1ThreadOffset.html @@ -124,7 +124,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits-members.html index 0a64b450cb..48ea29a014 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits-members.html @@ -94,7 +94,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits.html index ed1fb90bd4..daf80596fd 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits.html @@ -367,7 +367,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits_1_1ThreadOffset-members.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits_1_1ThreadOffset-members.html index 1298ee529a..6c10a6407c 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits_1_1ThreadOffset.html b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits_1_1ThreadOffset.html index cc55e56e80..f5067a3519 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1gemm_1_1GemmSharedStoreWithSkewTileAbTraits_1_1ThreadOffset.html @@ -121,7 +121,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA.html b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA.html index 8c1ffaf9e4..942c110dbc 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA.html @@ -84,7 +84,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html index 9bca290fbb..a8bf4cb25e 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html @@ -88,7 +88,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html index 7affa6ef61..5987120289 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html @@ -88,7 +88,7 @@
    -cutlass::gemm::IgemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ > +cutlass::gemm::IgemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_, Index_ >
    @@ -103,7 +103,7 @@ - + @@ -190,7 +190,7 @@

    - +
    typedef GemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kColumnMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kW >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits
     The traits class to build the iterator to load data from global memory for A^N. More...
     
    typedef GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA > SharedStoreTileTraits
    typedef GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA > SharedStoreTileTraits
     The traits class to build the iterator to store data to shared memory for A^N. More...
     
    typedef GemmSharedLoadTileATraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsA, 0 > SharedLoadTileTraits
    typedef GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA> cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedStoreTileTraitstypedef GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA> cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedStoreTileTraits
    @@ -228,7 +228,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.png b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.png index 9ce259eb1e..f9de5952a5 100644 Binary files a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.png and b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.png differ diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html index 09585beb23..e4d88e7b7d 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html @@ -82,14 +82,13 @@ GlobalTileTraits typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > kLayoutcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >static kScalarsIn4Bcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >static - MultiplyAddScalar typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > - Scalar typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > - SharedLoadTileTraits typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > - SharedStoreTileTraits typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > + kSkewAcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >static + MultiplyAddScalar typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > + Scalar 
typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html index 809d799b13..f325dcd5a9 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html @@ -103,12 +103,6 @@ typedef GemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits  The traits class to build the iterator to load data from global memory for A^T. More...
      -typedef GemmSharedStoreWithSkewTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA, 128/sizeof(MultiplyAddScalar)/GemmConfig_::kScalarsPerStsA/GlobalTileTraits::Threads::kW *kScalarsIn4BSharedStoreTileTraits - The traits class to build the iterator to store data to shared memory for A^T. More...
    -  -typedef GemmSharedLoadTileATraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsA, SharedStoreTileTraits::kSkew > SharedLoadTileTraits - The traits class to build the iterator to load from shared memory for A^T. More...
    -  @@ -118,6 +112,9 @@ + + +

    Static Public Attributes

    static int const kScalarsIn4B = sizeof(MultiplyAddScalar) > 4 ? 1 : 4 / sizeof(MultiplyAddScalar)
     The number of scalars in 4B. More...
     
    static int const kSkewA
     The skew for A. More...
     

    Member Typedef Documentation

    @@ -168,41 +165,33 @@

    -

    ◆ SharedLoadTileTraits

    - -
    -
    -
    -template<typename GemmConfig_ >
    - - - - -
    typedef GemmSharedLoadTileATraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsA, SharedStoreTileTraits::kSkew> cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::SharedLoadTileTraits
    -
    - -
    -
    - -

    ◆ SharedStoreTileTraits

    +

    Member Data Documentation

    + +

    ◆ kLayout

    template<typename GemmConfig_ >
    + + + + + +
    - +
    typedef GemmSharedStoreWithSkewTileAbTraits< MultiplyAddScalar, Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA, 128 / sizeof(MultiplyAddScalar) / GemmConfig_::kScalarsPerStsA / GlobalTileTraits::Threads::kW * kScalarsIn4B> cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::SharedStoreTileTraitsMatrixLayout::Kind const cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::kLayout = MatrixLayout::kRowMajor
    +
    +static
    -

    Member Data Documentation

    - -

    ◆ kLayout

    + +

    ◆ kScalarsIn4B

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html index be5a0a9b1e..7ff002906d 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html @@ -82,14 +82,13 @@ GlobalTileTraits typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > kLayoutcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >static kScalarsIn4Bcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >static - MultiplyAddScalar typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > - Scalar typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > - SharedLoadTileTraits typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > - SharedStoreTileTraits typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > + kSkewBcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >static + MultiplyAddScalar typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > + Scalar typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html index fc90114c00..370667fb63 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html @@ -103,12 +103,6 @@ typedef GemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits  The traits class to build the iterator to load data from global memory for B^N. More...
      -typedef GemmSharedStoreWithSkewTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB, 128/sizeof(MultiplyAddScalar)/GemmConfig_::kScalarsPerStsB/GlobalTileTraits::Threads::kW *kScalarsIn4BSharedStoreTileTraits - The traits class to build the iterator to store data to shared memory for B^N. More...
    -  -typedef GemmSharedLoadTileBTraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsB, SharedStoreTileTraits::kSkew > SharedLoadTileTraits - The traits class to build the iterator to load from shared memory for B^N. More...
    -  @@ -118,6 +112,9 @@ + + +

    Static Public Attributes

    static int const kScalarsIn4B = sizeof(MultiplyAddScalar) > 4 ? 1 : 4 / sizeof(MultiplyAddScalar)
     The number of scalars in 4B. More...
     
    static int const kSkewB
     The skew for B. More...
     

    Member Typedef Documentation

    @@ -168,41 +165,33 @@

    -

    ◆ SharedLoadTileTraits

    - -
    -
    -
    -template<typename GemmConfig_ >
    - - - - -
    typedef GemmSharedLoadTileBTraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsB, SharedStoreTileTraits::kSkew> cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedLoadTileTraits
    -
    - -
    -
    - -

    ◆ SharedStoreTileTraits

    +

    Member Data Documentation

    + +

    ◆ kLayout

    template<typename GemmConfig_ >
    + + + + + +
    - +
    typedef GemmSharedStoreWithSkewTileAbTraits< MultiplyAddScalar, Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB, 128 / sizeof(MultiplyAddScalar) / GemmConfig_::kScalarsPerStsB / GlobalTileTraits::Threads::kW * kScalarsIn4B> cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedStoreTileTraitsMatrixLayout::Kind const cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::kLayout = MatrixLayout::kColumnMajor
    +
    +static
    -

    Member Data Documentation

    - -

    ◆ kLayout

    + +

    ◆ kScalarsIn4B

    @@ -213,7 +202,7 @@

    - +
    MatrixLayout::Kind const cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::kLayout = MatrixLayout::kColumnMajorint const cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::kScalarsIn4B = sizeof(MultiplyAddScalar) > 4 ? 1 : 4 / sizeof(MultiplyAddScalar)
    @@ -225,8 +214,8 @@

    -

    ◆ kScalarsIn4B

    + +

    ◆ kSkewB


    The documentation for this struct was generated from the following file:
      @@ -255,7 +244,7 @@

      diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html index 04d0fed09e..a3e42a5f09 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html @@ -88,7 +88,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html index d2976060d6..3b077381d8 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html @@ -88,7 +88,7 @@
    -cutlass::gemm::IgemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ > +cutlass::gemm::IgemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_, Index_ >
    @@ -103,7 +103,7 @@ - + @@ -190,7 +190,7 @@

    - +
    typedef GemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kRowMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kH >, Shape< 1, ShapeCount< typename GemmConfig_::Warps >::kCount, GemmConfig_::kWarpSize >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits
     The traits class to build the iterator to load data from global memory for B^T. More...
     
    typedef GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB > SharedStoreTileTraits
    typedef GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB > SharedStoreTileTraits
     The traits class to build the iterator to store data to shared memory for B^T. More...
     
    typedef GemmSharedLoadTileBTraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsB, 0 > SharedLoadTileTraits
    typedef GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB> cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::SharedStoreTileTraitstypedef GemmSharedStoreTileAbTraits< MultiplyAddScalar, Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB> cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kRowMajor, GemmConfig_ >::SharedStoreTileTraits
    @@ -228,7 +228,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.png b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.png index f291cad7a3..b0f8b6c70f 100644 Binary files a/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.png and b/docs/structcutlass_1_1gemm_1_1GemmTileTraitsHelperB_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.png differ diff --git a/docs/structcutlass_1_1gemm_1_1GemmTraits-members.html b/docs/structcutlass_1_1gemm_1_1GemmTraits-members.html index 05cab0611a..0a2041245b 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTraits-members.html @@ -83,9 +83,11 @@ ClearAccumulators typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > Epilogue typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > GemmConfig typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > - GlobalLoadStreamA typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > - GlobalLoadStreamB typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > - Index typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > + 
GlobalLoadStream typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > + GlobalLoadStreamA typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > + GlobalLoadStreamB typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > + Index typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > + KernelClass typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > kLayoutAcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >static kLayoutBcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >static MultiplyAdd typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > @@ -98,12 +100,13 @@ shared_store_fence(bool in_loop)cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >inlinestatic SharedLoadStreamA typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ 
> SharedLoadStreamB typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > - SharedStoreStorageA typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > - SharedStoreStorageB typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > + SharedStream typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > + This_ typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > + ThreadblockTileStorage typedefcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTraits.html b/docs/structcutlass_1_1gemm_1_1GemmTraits.html index 7153c8237a..a97a8d4a8d 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTraits.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTraits.html @@ -87,29 +87,27 @@ - - - - - - - + - -

    Classes

    struct  GlobalLoadStream
     Assemble the global load streams for A/B. More...
     
    struct  MainLoopSharedStorage
     
    struct  Params
     The params. More...
     
    struct  SharedLoadStream
     Assemble the shared load stream for A/B. More...
     Parameters object constructable on the host. More...
     
    union  SharedStorage
     The storage in shared memory. More...
     
    union  StreamSharedStorage
     
    + + + + + + - + @@ -130,13 +128,7 @@ - - - - - - - + @@ -156,6 +148,15 @@ + + + + + + + + +

    Public Types

    typedef GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ > This_
     This traits. More...
     
    typedef cutlass::gemm::Gemm< This_KernelClass
     The struct that consumes this Traits. More...
     
    typedef GemmConfig_ GemmConfig
     The configuration. More...
     
    typedef GemmConfig::OutputTile OutputTile
    typedef GemmConfig::OutputTile OutputTile
     The output tile. More...
     
    typedef GlobalLoadStreamA_ GlobalLoadStreamA
    typedef SharedLoadStreamB_ SharedLoadStreamB
     The iterator for B to load from shared memory. More...
     
    typedef GlobalLoadStreamA::SharedStoreStorage SharedStoreStorageA
     The shared storage for A. More...
     
    typedef GlobalLoadStreamB::SharedStoreStorage SharedStoreStorageB
     The shared storage for B. More...
     
    typedef GemmConfig::MultiplyAdd MultiplyAdd
    typedef GemmConfig::MultiplyAdd MultiplyAdd
     The multiply-add functor. More...
     
    typedef Epilogue_ Epilogue
    typedef ClearAccumulators_ ClearAccumulators
     Clear the accumulators. More...
     
    typedef GlobalLoadStreamPair< GlobalLoadStreamA, GlobalLoadStreamB, GemmConfig::kResidueInPrologGlobalLoadStream
     Assemble the global load streams for A/B. More...
     
    typedef GlobalLoadStream::ThreadblockTileStorage ThreadblockTileStorage
     Memory needed to store the threadblock-scoped GEMM tile. More...
     
    typedef SharedStreamPair< SharedLoadStreamA, SharedLoadStreamBSharedStream
     Assemble the shared load streams for A/B. More...
     
    @@ -182,7 +183,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    Static Public Member Functions

    @@ -198,7 +199,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef BlockSwizzle_ cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::BlockSwizzle
    @@ -214,7 +215,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef ClearAccumulators_ cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::ClearAccumulators
    @@ -230,7 +231,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef Epilogue_ cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Epilogue
    @@ -238,6 +239,22 @@

    + + + +

    ◆ GlobalLoadStream

    + +
    +
    +
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    +
    typedef GemmConfig_ cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::GemmConfig
    + + + +
    typedef GlobalLoadStreamPair<GlobalLoadStreamA, GlobalLoadStreamB, GemmConfig::kResidueInProlog> cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::GlobalLoadStream
    +
    +
    @@ -246,7 +263,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    @@ -262,7 +279,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef GlobalLoadStreamA_ cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::GlobalLoadStreamA
    @@ -278,7 +295,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef GlobalLoadStreamB_ cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::GlobalLoadStreamB
    @@ -286,6 +303,22 @@

    + + + +

    ◆ KernelClass

    + +
    +
    +
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    +
    typedef Index_ cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Index
    + + + +
    typedef cutlass::gemm::Gemm<This_> cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::KernelClass
    +
    +
    @@ -294,10 +327,10 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - +
    typedef GemmConfig::MultiplyAdd cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MultiplyAddtypedef GemmConfig::MultiplyAdd cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MultiplyAdd
    @@ -326,7 +359,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    @@ -342,7 +375,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef GlobalLoadStreamA_::Scalar cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::ScalarA
    @@ -358,7 +391,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef GlobalLoadStreamB_::Scalar cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::ScalarB
    @@ -374,7 +407,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef Epilogue::ScalarC cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::ScalarC
    @@ -390,7 +423,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef Epilogue::ScalarD cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::ScalarD
    @@ -406,7 +439,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    typedef SharedLoadStreamA_ cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::SharedLoadStreamA
    @@ -416,32 +449,48 @@

    -

    ◆ SharedStoreStorageA

    + +

    ◆ SharedStream

    + +
    +
    +
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    +
    typedef SharedLoadStreamB_ cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::SharedLoadStreamB
    + + + +
    typedef SharedStreamPair<SharedLoadStreamA, SharedLoadStreamB> cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::SharedStream
    +

    + +
    + + +

    ◆ This_

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - +
    typedef GlobalLoadStreamA::SharedStoreStorage cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::SharedStoreStorageAtypedef GemmTraits<GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_> cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::This_
    - -

    ◆ SharedStoreStorageB

    + +

    ◆ ThreadblockTileStorage

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - +
    typedef GlobalLoadStreamB::SharedStoreStorage cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::SharedStoreStorageBtypedef GlobalLoadStream::ThreadblockTileStorage cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::ThreadblockTileStorage
    @@ -455,7 +504,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - +
    @@ -483,7 +532,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - + - - - + + + - - - + + + @@ -122,25 +122,28 @@ - - - - - - - - - + + + + + + + + + + + + - - - - + + + +
    @@ -512,7 +561,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    @@ -536,7 +585,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    @@ -560,7 +609,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1MainLoopSharedStorage-members.html b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1MainLoopSharedStorage-members.html index 5f36220360..fd59ce9278 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1MainLoopSharedStorage-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1MainLoopSharedStorage-members.html @@ -80,12 +80,12 @@

    This is the complete list of members for cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage, including all inherited members.

    - - + +
    clearcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage
    stream_acutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage
    stream_bcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage
    global_to_shared_streamcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage
    threadblock_tilecutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage
    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1MainLoopSharedStorage.html b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1MainLoopSharedStorage.html index 95f9a8291c..6dd16c31a6 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1MainLoopSharedStorage.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1MainLoopSharedStorage.html @@ -84,11 +84,14 @@ - - - - + + + + + + +

    Public Attributes

    StreamSharedStorage< GlobalLoadStreamA, SharedLoadStreamAstream_a
     
    StreamSharedStorage< GlobalLoadStreamB, SharedLoadStreamBstream_b
     
    ThreadblockTileStorage threadblock_tile
     Stores the threadblock tile. More...
     
    GlobalLoadStream::SharedStorage global_to_shared_stream
     Storage for GEMM global stream. More...
     
    ClearAccumulators::SharedStorage clear
     Storage for clearing accumulators. More...
     

    Member Data Documentation

    @@ -98,7 +101,7 @@

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    @@ -108,32 +111,32 @@

    -

    ◆ stream_a

    + +

    ◆ global_to_shared_stream

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>

    ClearAccumulators::SharedStorage cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage::clear
    - +
    StreamSharedStorage<GlobalLoadStreamA, SharedLoadStreamA> cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage::stream_aGlobalLoadStream::SharedStorage cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage::global_to_shared_stream
    - -

    ◆ stream_b

    + +

    ◆ threadblock_tile

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - +
    StreamSharedStorage<GlobalLoadStreamB, SharedLoadStreamB> cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage::stream_bThreadblockTileStorage cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::MainLoopSharedStorage::threadblock_tile
    @@ -146,7 +149,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params-members.html b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params-members.html index 05de1ce4e7..dc6d1859fb 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params-members.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params-members.html @@ -79,19 +79,21 @@

    This is the complete list of members for cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params, including all inherited members.

    + + - - + + - - - - - + + + + +
    blockcutlass::KernelLaunchConfiguration
    dynamic_smemcutlass::KernelLaunchConfiguration
    epiloguecutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    global_stream_acutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    global_stream_bcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    global_to_shared_streamcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    gridcutlass::KernelLaunchConfiguration
    initialize(GemmDesc_ const &desc)cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Paramsinline
    kcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    mcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    ncutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    shared_stream_acutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    shared_stream_bcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    initialize(Index m, Index n, Index k, typename Epilogue::Scalar alpha, ScalarA const *d_a, Index lda, ScalarB const *d_b, Index ldb, typename Epilogue::Scalar beta, ScalarC const *d_c, Index ldc, ScalarD *d_d, Index ldd)cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Paramsinline
    initialize(Index m, Index n, Index k, typename Epilogue::Scalar alpha, ScalarA const *d_a, Index lda, long long int batch_stride_A, ScalarB const *d_b, Index ldb, long long int batch_stride_B, typename Epilogue::Scalar beta, ScalarC const *d_c, Index ldc, long long int batch_stride_C, ScalarD *d_d, Index ldd, long long int batch_stride_D, Index batch_count)cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Paramsinline
    KernelLaunchConfiguration(dim3 _grid=dim3(1, 1, 1), dim3 _block=dim3(1, 1, 1), size_t _dynamic_smem=0)cutlass::KernelLaunchConfigurationinline
    problem_sizecutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params
    shared_streamcutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params.html b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params.html index ffeb872b17..22a9ff40d2 100644 --- a/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params.html +++ b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params.html @@ -81,10 +81,19 @@
    -

    The params. +

    Parameters object constructable on the host.

    #include <gemm_traits.h>

    +
    +Inheritance diagram for cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params:
    +
    +
    + + +cutlass::KernelLaunchConfiguration + +
    @@ -92,40 +101,50 @@ + + + + + + + + + +

    Public Member Functions

    CUTLASS_HOST_DEVICE int initialize (GemmDesc_ const &desc)
     Initialize the parameters. More...
     
    CUTLASS_HOST_DEVICE int initialize (Index m, Index n, Index k, typename Epilogue::Scalar alpha, ScalarA const *d_a, Index lda, ScalarB const *d_b, Index ldb, typename Epilogue::Scalar beta, ScalarC const *d_c, Index ldc, ScalarD *d_d, Index ldd)
     Helper to construct a GEMM params using a BLAS-like API. More...
     
    CUTLASS_HOST_DEVICE int initialize (Index m, Index n, Index k, typename Epilogue::Scalar alpha, ScalarA const *d_a, Index lda, long long int batch_stride_A, ScalarB const *d_b, Index ldb, long long int batch_stride_B, typename Epilogue::Scalar beta, ScalarC const *d_c, Index ldc, long long int batch_stride_C, ScalarD *d_d, Index ldd, long long int batch_stride_D, Index batch_count)
     Helper to construct a batched GEMM params. More...
     
    - Public Member Functions inherited from cutlass::KernelLaunchConfiguration
    CUTLASS_HOST_DEVICE KernelLaunchConfiguration (dim3 _grid=dim3(1, 1, 1), dim3 _block=dim3(1, 1, 1), size_t _dynamic_smem=0)
     Constructs a KernelLaunchConfiguration object. More...
     
    - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + +

    Public Attributes

    Index m
     The dimensions of the GEMM. More...
     
    Index n
     
    Index k
     
    GlobalLoadStreamA::Params global_stream_a
     The params for the A stream. More...
     
    GlobalLoadStreamB::Params global_stream_b
     The params for the B stream. More...
     
    SharedLoadStreamA::Params shared_stream_a
     The params for the A stream from shared memory. More...
     
    SharedLoadStreamB::Params shared_stream_b
     The params for the B stream from shared memory. More...
     
    GemmCoord problem_size
     GEMM problem size. More...
     
    GlobalLoadStream::Params global_to_shared_stream
     Parameters object for the global load stream. More...
     
    SharedStream::Params shared_stream
     Parameters object for the shared load stream. More...
     
    Epilogue::Params epilogue
     The params for the epilogue. More...
     
    - Public Attributes inherited from cutlass::KernelLaunchConfiguration
    dim3 grid
     CUDA grid dimensions. More...
     
    dim3 block
     CUDA threadblock dimensions. More...
     
    size_t dynamic_smem
     Bytes of dynamically allocated SMEM in addition to static SMEM. More...
     

    Member Function Documentation

    -

    ◆ initialize()

    +

    ◆ initialize() [1/3]

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    template<typename GemmDesc_ >
    @@ -149,129 +168,303 @@

    Member Data Documentation

    - -

    ◆ epilogue

    + +

    ◆ initialize() [2/3]

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    +
    + + + + +
    - + + + + -
    Epilogue::Params cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::epilogueCUTLASS_HOST_DEVICE int cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::initialize (Index m,
    -
    - -
    - - -

    ◆ global_stream_a

    - -
    -
    -
    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    - - + + + + + + + + + + -
    GlobalLoadStreamA::Params cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::global_stream_aIndex n,
    Index k,
    -
    - -
    -
    - -

    ◆ global_stream_b

    - -
    -
    -
    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    GlobalLoadStreamB::Params cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::global_stream_btypename Epilogue::Scalar alpha,
    ScalarA const * d_a,
    Index lda,
    ScalarB const * d_b,
    Index ldb,
    typename Epilogue::Scalar beta,
    ScalarC const * d_c,
    Index ldc,
    ScalarD * d_d,
    Index ldd 
    )
    +
    +inline
    - -

    ◆ k

    + +

    ◆ initialize() [3/3]

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    + + + + + +
    - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Index cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::kCUTLASS_HOST_DEVICE int cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::initialize (Index m,
    Index n,
    Index k,
    typename Epilogue::Scalar alpha,
    ScalarA const * d_a,
    Index lda,
    long long int batch_stride_A,
    ScalarB const * d_b,
    Index ldb,
    long long int batch_stride_B,
    typename Epilogue::Scalar beta,
    ScalarC const * d_c,
    Index ldc,
    long long int batch_stride_C,
    ScalarD * d_d,
    Index ldd,
    long long int batch_stride_D,
    Index batch_count 
    )
    +
    +inline
    - -

    ◆ m

    +

    Member Data Documentation

    + +

    ◆ epilogue

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - +
    Index cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::mEpilogue::Params cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::epilogue
    - -

    ◆ n

    + +

    ◆ global_to_shared_stream

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - +
    Index cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::nGlobalLoadStream::Params cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::global_to_shared_stream
    - -

    ◆ shared_stream_a

    + +

    ◆ problem_size

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - +
    SharedLoadStreamA::Params cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::shared_stream_aGemmCoord cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::problem_size
    - -

    ◆ shared_stream_b

    + +

    ◆ shared_stream

    -template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Scalar>>
    +template<typename GemmConfig_, typename GlobalLoadStreamA_, typename GlobalLoadStreamB_, typename SharedLoadStreamA_, typename SharedLoadStreamB_, typename Epilogue_, typename BlockSwizzle_ = IdentityBlockSwizzle, typename Index_ = int, typename ClearAccumulators_ = ClearAccumulators<typename GemmConfig_::Accumulators::Element>>
    - +
    SharedLoadStreamB::Params cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::shared_stream_bSharedStream::Params cutlass::gemm::GemmTraits< GemmConfig_, GlobalLoadStreamA_, GlobalLoadStreamB_, SharedLoadStreamA_, SharedLoadStreamB_, Epilogue_, BlockSwizzle_, Index_, ClearAccumulators_ >::Params::shared_stream
    @@ -284,7 +477,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params.png b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params.png new file mode 100644 index 0000000000..6d6c416a37 Binary files /dev/null and b/docs/structcutlass_1_1gemm_1_1GemmTraits_1_1Params.png differ diff --git a/docs/structcutlass_1_1gemm_1_1GetExtent.html b/docs/structcutlass_1_1gemm_1_1GetExtent.html index c955db6565..c84e02823d 100644 --- a/docs/structcutlass_1_1gemm_1_1GetExtent.html +++ b/docs/structcutlass_1_1gemm_1_1GetExtent.html @@ -84,7 +84,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kA_00_01Tile___01_4-members.html b/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kA_00_01Tile___01_4-members.html index 725806f650..351a7645b2 100644 --- a/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kA_00_01Tile___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kA_00_01Tile___01_4-members.html @@ -83,7 +83,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kA_00_01Tile___01_4.html b/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kA_00_01Tile___01_4.html index 4e613ddc9a..2f4be77a31 100644 --- a/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kA_00_01Tile___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kA_00_01Tile___01_4.html @@ -118,7 +118,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kB_00_01Tile___01_4-members.html b/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kB_00_01Tile___01_4-members.html index d17a7e7c5f..1be152115c 100644 --- a/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kB_00_01Tile___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kB_00_01Tile___01_4-members.html @@ -83,7 +83,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kB_00_01Tile___01_4.html b/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kB_00_01Tile___01_4.html index 172db999e2..a982badec8 100644 --- a/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kB_00_01Tile___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1GetExtent_3_01GemmOperand_1_1kB_00_01Tile___01_4.html @@ -118,7 +118,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1GlobalLoadStream-members.html b/docs/structcutlass_1_1gemm_1_1GlobalLoadStream-members.html index f3b227eb8c..9cdc4c82f1 100644 --- a/docs/structcutlass_1_1gemm_1_1GlobalLoadStream-members.html +++ b/docs/structcutlass_1_1gemm_1_1GlobalLoadStream-members.html @@ -73,38 +73,46 @@
    -
    cutlass::gemm::GlobalLoadStream< LoadIterator_, StoreIterator_, Transformer_ > Member List
    +
    cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ > Member List
    -

    This is the complete list of members for cutlass::gemm::GlobalLoadStream< LoadIterator_, StoreIterator_, Transformer_ >, including all inherited members.

    +

    This is the complete list of members for cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >, including all inherited members.

    - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Base typedefcutlass::gemm::GlobalLoadStream< LoadIterator_, StoreIterator_, Transformer_ >
    commit()cutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >inline
    copy()cutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >inline
    fetched_fragmentcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    FetchedFragment typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    Fragment typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    GlobalLoadStream(typename Base::Params const &params, typename Base::SharedStorage &shared_storage, Coord< 3 > const &bounds, Coord< 3 > const &block)cutlass::gemm::GlobalLoadStream< LoadIterator_, StoreIterator_, Transformer_ >inline
    GlobalLoadStreamBase(Params const &params, SharedStorage &shared_storage, Coord< 3 > const bounds, Coord< 3 > const &block)cutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >inline
    Index typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    kLayoutcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >static
    load_iteratorcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    LoadIterator typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    Pointer typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    residue(Index k, bool skip_clear=false)cutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >inline
    Scalar typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    SharedStoreStorage typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    store_iteratorcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    StoreIterator typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    transformed_fragmentcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    TransformedFragment typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    Transformer typedefcutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    transformercutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    commit()cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >inline
    copy()cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >inline
    fetched_fragmentcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    FetchedFragment typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    Fragment typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    GlobalLoadStream(Params const &_params, SharedStorage &shared_storage, ThreadblockTileRef const &threadblock_tile_ref, Coord< 3 > const bounds, Coord< 3 > const &_threadblock_offset)cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >inline
    Index typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    kLayoutcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >static
    kOperandcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >static
    load_iteratorcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    LoadIterator typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    move_to_residue(Index k, Index kTileK)cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >inline
    multiplicand_boundscutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    operator+=(Coord< 3 > const &offset)cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >inline
    paramscutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    Pointer typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    project_coordinate(Coord< 3 > const &coord, Index d_offset=0)cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >inlinestatic
    residue(Index k, bool skip_clear=false)cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >inline
    rollback(void)cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >inline
    Scalar typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    store_iteratorcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    StoreIterator typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    threadblock_offsetcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    ThreadblockTileRef typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    ThreadblockTileStorage typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    Tile typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    transformed_fragmentcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    TransformedFragment typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    transformercutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    Transformer typedefcutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >
    diff --git a/docs/structcutlass_1_1gemm_1_1GlobalLoadStream.html b/docs/structcutlass_1_1gemm_1_1GlobalLoadStream.html index 99cbcad5da..b359559570 100644 --- a/docs/structcutlass_1_1gemm_1_1GlobalLoadStream.html +++ b/docs/structcutlass_1_1gemm_1_1GlobalLoadStream.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::GlobalLoadStream< LoadIterator_, StoreIterator_, Transformer_ > Struct Template Reference +Cutlass: cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ > Struct Template Reference @@ -73,115 +73,321 @@
    -
    cutlass::gemm::GlobalLoadStream< LoadIterator_, StoreIterator_, Transformer_ > Struct Template Reference
    +
    cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ > Struct Template Reference

    #include <gemm_global_stream.h>

    -
    -Inheritance diagram for cutlass::gemm::GlobalLoadStream< LoadIterator_, StoreIterator_, Transformer_ >:
    -
    -
    - - -cutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ > - -
    + + + + + + +

    +Classes

    struct  Params
     The params. More...
     
    struct  SharedStorage
     
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Public Types

    typedef GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ > Base
     The base class. More...
     
    - Public Types inherited from cutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    typedef LoadIterator_ LoadIterator
     The load iterator. More...
     
    typedef Transformer_ Transformer
     The transformer. More...
     
    typedef StoreIterator_ StoreIterator
     The store iterator to write to shared memory. More...
     
    typedef LoadIterator::Fragment FetchedFragment
     The fragment that is copied from shared memory. More...
     
    typedef Transformer::OutputFragment TransformedFragment
     The fragment that is obtained after the transformation by the transformer. More...
     
    typedef TransformedFragment Fragment
     Make sure the fragments match. More...
     
    typedef LoadIterator::Scalar Scalar
     The scalar type of the iterator. More...
     
    typedef LoadIterator::Pointer Pointer
     The pointer. More...
     
    typedef LoadIterator::Index Index
     The index. More...
     
    typedef StoreIterator::SharedStorage SharedStoreStorage
     The amount of storage in shared memory needed to store the tile. More...
     
    typedef LoadIterator_ LoadIterator
     The load iterator. More...
     
    typedef Transformer_ Transformer
     The transformer. More...
     
    typedef StoreIterator_ StoreIterator
     The store iterator to write to shared memory. More...
     
    typedef LoadIterator::Fragment FetchedFragment
     The fragment that is copied from shared memory. More...
     
    typedef Transformer::OutputFragment TransformedFragment
     The fragment that is obtained after the transformation by the transformer. More...
     
    typedef TransformedFragment Fragment
     Make sure the fragments match. More...
     
    typedef LoadIterator::Scalar Scalar
     The scalar type of the iterator. More...
     
    typedef LoadIterator::Pointer Pointer
     The pointer. More...
     
    typedef LoadIterator::Index Index
     The index. More...
     
    typedef LoadIterator::Tile Tile
     The tile. More...
     
    typedef TileAllocation< typename StoreIterator::Scalar, typename StoreIterator::Tile > ThreadblockTileStorage
     Shared memory allocation for the tile. More...
     
    typedef ThreadblockTileStorage::TensorRef ThreadblockTileRef
     Tensor reference to threadblock tile. More...
     
    - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + +

    Public Member Functions

    CUTLASS_DEVICE GlobalLoadStream (typename Base::Params const &params, typename Base::SharedStorage &shared_storage, Coord< 3 > const &bounds, Coord< 3 > const &block)
     Ctor. More...
     
    - Public Member Functions inherited from cutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    CUTLASS_DEVICE GlobalLoadStreamBase (Params const &params, SharedStorage &shared_storage, Coord< 3 > const bounds, Coord< 3 > const &block)
     Ctor. More...
     
    CUTLASS_DEVICE void copy ()
     Load the data from shared memory to the fetch fragment. More...
     
    CUTLASS_DEVICE void commit ()
     Commit the data. More...
     
    CUTLASS_DEVICE void residue (Index k, bool skip_clear=false)
     Execute the residue code. More...
     
    CUTLASS_DEVICE GlobalLoadStream (Params const &_params, SharedStorage &shared_storage, ThreadblockTileRef const &threadblock_tile_ref, Coord< 3 > const bounds, Coord< 3 > const &_threadblock_offset)
     Ctor. More...
     
    CUTLASS_DEVICE void copy ()
     Load the data from shared memory to the fetch fragment. More...
     
    CUTLASS_DEVICE void commit ()
     Commit the data. More...
     
    CUTLASS_DEVICE void residue (Index k, bool skip_clear=false)
     Execute the residue code. More...
     
    CUTLASS_DEVICE void move_to_residue (Index k, Index kTileK)
     Move to the residue portion. More...
     
    CUTLASS_DEVICE void rollback (void)
     Rollback to the beginning of the first tile. More...
     
    CUTLASS_DEVICE GlobalLoadStream & operator+= (Coord< 3 > const &offset)
     Adds a Coord<3> to the underlying global load iterator. More...
     
    + + + +

    +Static Public Member Functions

    static CUTLASS_DEVICE Coord< 3 > project_coordinate (Coord< 3 > const &coord, Index d_offset=0)
     Maps a coordinate in the GEMM's (K, N, M) coordinate system to global memory. More...
     
    - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + +

    -Additional Inherited Members

    - Public Attributes inherited from cutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    LoadIterator load_iterator
     The iterator. More...
     
    FetchedFragment fetched_fragment
     The fragment to fetch from shared memory. More...
     
    Transformer transformer
     The transformer. More...
     
    TransformedFragment transformed_fragment
     The fragment to convert the data after it has been fetched from shared memory. More...
     
    StoreIterator store_iterator
     The store iterator. More...
     
    - Static Public Attributes inherited from cutlass::gemm::GlobalLoadStreamBase< LoadIterator_, StoreIterator_, Transformer_ >
    static MatrixLayout::Kind const kLayout = LoadIterator::kLayout
     Make sure the transformed fragment is the same as the store fragment. More...
     

    +Public Attributes

    Params params
     Parameters. More...
     
    Coord< 3 > multiplicand_bounds
     Multiplicand bounds. More...
     
    Coord< 3 > threadblock_offset
     Threadblock offset. More...
     
    LoadIterator load_iterator
     The iterator. More...
     
    FetchedFragment fetched_fragment
     The fragment to fetch from shared memory. More...
     
    Transformer transformer
     The transformer. More...
     
    TransformedFragment transformed_fragment
     The fragment to convert the data after it has been fetched from shared memory. More...
     
    StoreIterator store_iterator
     The store iterator. More...
     
    + + + + + + +

    +Static Public Attributes

    static GemmOperand::Kind const kOperand = Operand
     Indicates the type of GEMM operand. More...
     
    static MatrixLayout::Kind const kLayout = LoadIterator::kLayout
     Make sure the transformed fragment is the same as the store fragment. More...
     

    Member Typedef Documentation

    - -

    ◆ Base

    + +

    ◆ FetchedFragment

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef LoadIterator::Fragment cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::FetchedFragment
    +
    + +
    +
    + +

    ◆ Fragment

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef TransformedFragment cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::Fragment
    +
    +

    The output fragment.

    + +
    +
    + +

    ◆ Index

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef LoadIterator::Index cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::Index
    +
    + +
    +
    + +

    ◆ LoadIterator

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef LoadIterator_ cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::LoadIterator
    +
    + +
    +
    + +

    ◆ Pointer

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef LoadIterator::Pointer cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::Pointer
    +
    + +
    +
    + +

    ◆ Scalar

    -template<typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ = Copy<typename LoadIterator_::Fragment>>
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    - + + +
    typedef GlobalLoadStreamBase<LoadIterator_, StoreIterator_, Transformer_> cutlass::gemm::GlobalLoadStream< LoadIterator_, StoreIterator_, Transformer_ >::Basetypedef LoadIterator::Scalar cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::Scalar
    +
    + +
    +
    + +

    ◆ StoreIterator

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef StoreIterator_ cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::StoreIterator
    +
    + +
    +
    + +

    ◆ ThreadblockTileRef

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef ThreadblockTileStorage::TensorRef cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::ThreadblockTileRef
    +
    + +
    +
    + +

    ◆ ThreadblockTileStorage

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef TileAllocation<typename StoreIterator::Scalar, typename StoreIterator::Tile> cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::ThreadblockTileStorage
    +
    + +
    +
    + +

    ◆ Tile

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef LoadIterator::Tile cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::Tile
    +
    + +
    +
    + +

    ◆ TransformedFragment

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    typedef Transformer::OutputFragment cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::TransformedFragment
    +
    + +
    +
    + +

    ◆ Transformer

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + +
    typedef Transformer_ cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::Transformer
    @@ -189,40 +395,46 @@

    Constructor & Destructor Documentation

    - -

    ◆ GlobalLoadStream()

    + +

    ◆ GlobalLoadStream()

    -template<typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ = Copy<typename LoadIterator_::Fragment>>
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    - + @@ -91,12 +91,13 @@ - - + + +
    - + - - + + - + - + + + + + + + - + @@ -237,6 +449,409 @@

    + + +

    Member Function Documentation

    +
    +

    ◆ commit()

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    +

    CUTLASS_DEVICE cutlass::gemm::GlobalLoadStream< LoadIterator_, StoreIterator_, Transformer_ >::GlobalLoadStream CUTLASS_DEVICE cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::GlobalLoadStream (typename Base::Params const & params, Params const & _params,
    typename Base::SharedStorageSharedStorage shared_storage,
    Coord< 3 > const & ThreadblockTileRef const & threadblock_tile_ref,
    Coord< 3 > const  bounds,
    Coord< 3 > const & block _threadblock_offset 
    + + + + +
    + + + + + + + +
    CUTLASS_DEVICE void cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::commit ()
    +
    +inline
    +
    + +
    + + +

    ◆ copy()

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + + +
    + + + + + + + +
    CUTLASS_DEVICE void cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::copy ()
    +
    +inline
    +
    + +
    +
    + +

    ◆ move_to_residue()

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + + +
    + + + + + + + + + + + + + + + + + + +
    CUTLASS_DEVICE void cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::move_to_residue (Index k,
    Index kTileK 
    )
    +
    +inline
    +
    + +
    +
    + +

    ◆ operator+=()

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + + +
    + + + + + + + + +
    CUTLASS_DEVICE GlobalLoadStream& cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::operator+= (Coord< 3 > const & offset)
    +
    +inline
    +
    + +
    +
    + +

    ◆ project_coordinate()

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + + +
    + + + + + + + + + + + + + + + + + + +
    static CUTLASS_DEVICE Coord<3> cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::project_coordinate (Coord< 3 > const & coord,
    Index d_offset = 0 
    )
    +
    +inlinestatic
    +
    + +
    +
    + +

    ◆ residue()

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + + +
    + + + + + + + + + + + + + + + + + + +
    CUTLASS_DEVICE void cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::residue (Index k,
    bool skip_clear = false 
    )
    +
    +inline
    +
    + +
    +
    + +

    ◆ rollback()

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + + +
    + + + + + + + + +
    CUTLASS_DEVICE void cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::rollback (void )
    +
    +inline
    +
    + +
    +
    +

    Member Data Documentation

    + +

    ◆ fetched_fragment

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    FetchedFragment cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::fetched_fragment
    +
    + +
    +
    + +

    ◆ kLayout

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + + +
    + + + + +
    MatrixLayout::Kind const cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::kLayout = LoadIterator::kLayout
    +
    +static
    +
    +

    The layout.

    + +
    +
    + +

    ◆ kOperand

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + + +
    + + + + +
    GemmOperand::Kind const cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::kOperand = Operand
    +
    +static
    +
    + +
    +
    + +

    ◆ load_iterator

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    LoadIterator cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::load_iterator
    +
    + +
    +
    + +

    ◆ multiplicand_bounds

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    Coord<3> cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::multiplicand_bounds
    +
    + +
    +
    + +

    ◆ params

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    Params cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::params
    +
    + +
    +
    + +

    ◆ store_iterator

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    StoreIterator cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::store_iterator
    +
    + +
    +
    + +

    ◆ threadblock_offset

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    Coord<3> cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::threadblock_offset
    +
    + +
    +
    + +

    ◆ transformed_fragment

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    TransformedFragment cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::transformed_fragment
    +
    + +
    +
    + +

    ◆ transformer

    + +
    +
    +
    +template<GemmOperand::Kind Operand, typename LoadIterator_ , typename StoreIterator_ , typename Transformer_ >
    + + + + +
    Transformer cutlass::gemm::GlobalLoadStream< Operand, LoadIterator_, StoreIterator_, Transformer_ >::transformer
    +
    +

    The documentation for this struct was generated from the following file:
    Base typedefcutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Delta typedefcutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    ImmediateOffsetStrides typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Iterations typedefcutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Iterations typedefcutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    kAccessSizecutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >static
    kLayoutcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >static
    kMemorySpacecutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >static
    Pointer typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Scalar typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Threads typedefcutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    ThreadsDelta typedefcutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Tile typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    ThreadsDelta typedefcutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    Tile typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    VectorizedTile typedefcutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits.html b/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits.html index eca01a3cb6..3696c45338 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits.html @@ -103,18 +103,18 @@

    typedef GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ > Base
     The base class. More...
     
    typedef Base::Threads Threads
    typedef Base::Threads Threads
     The threads. More...
     
    typedef Shape< 1, 2, Base::Tile::kC > ThreadsDelta
     The threads strides. More...
     
    typedef Shape< 1, 2, Base::VectorizedTile::kC > ThreadsDelta
     The threads strides. More...
     
    typedef Shape< Base::Threads::kH *2, 1, Base::Threads::kW, Base::kAccessSizeDelta
     The strides in each dimension between different loads/stores. More...
     
    typedef Shape< Base::Tile::kH/Base::Threads::kH/2, 2, Base::Tile::kW/Base::Threads::kW, Base::Tile::kC/Base::kAccessSizeIterations
     The number of iterations needed to load/store the tile. More...
     
    typedef Shape< Base::VectorizedTile::kH/Base::Threads::kH/2, 2, Base::VectorizedTile::kW/Base::Threads::kW, Base::VectorizedTile::kC/Base::kAccessSizeIterations
     The number of iterations needed to load/store the tile. More...
     
    - Public Types inherited from cutlass::gemm::GemmGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >
    typedef platform::remove_const< Scalar_ >::type Scalar
     The scalar. More...
    typedef Scalar_ * Pointer
     The pointer. More...
     
    typedef ReshapeTile< Tile_, kAccessSize_ >::Tile Tile
     The tile shape. More...
     
    typedef ReshapeThreads< Tile, Threads_ >::Threads Threads
     The threads shape. More...
     
    typedef Shape< 1, 1, Tile::kC > ThreadsDelta
     The relative offset between two elements in the H/W dimension in adjacent threads. More...
     
    typedef Tile_ Tile
     The tile shape. More...
     
    typedef ReshapeTile< Tile_, kAccessSize_ >::Tile VectorizedTile
     The vectorized tile shape. More...
     
    typedef ReshapeThreads< VectorizedTile, Threads_ >::Threads Threads
     The threads shape. More...
     
    typedef Shape< 1, 1, VectorizedTile::kC > ThreadsDelta
     The relative offset between two elements in the H/W dimension in adjacent threads. More...
     
    typedef Shape< 0, Threads::kH, Threads::kW *kAccessSizeDelta
     The strides in each dimension between different loads/stores. More...
     
    typedef Shape< 0, 0, Threads::kW *ThreadsDelta::kW, kAccessSizeImmediateOffsetStrides
     Strides for immediate offset computation. More...
     
    typedef Shape< 1, Tile::kH/Threads::kH, Tile::kW/Threads::kW, Tile::kC/kAccessSizeIterations
     The number of iterations needed to load/store the tile. More...
     
    typedef GemmMultiplicandTraits< Tile, kOperand, kLayoutMultiplicandTraits
    typedef Shape< 1, VectorizedTile::kH/Threads::kH, VectorizedTile::kW/Threads::kW, VectorizedTile::kC/kAccessSizeIterations
     The number of iterations needed to load/store the tile. More...
     
    typedef GemmMultiplicandTraits< Tile, kOperand, kLayoutMultiplicandTraits
     
    - +

    @@ -192,8 +195,8 @@

    -

    ◆ Iterations

    + +

    ◆ Iterations

    @@ -201,7 +204,7 @@

    typedef Shape<Base::Tile::kH / Base::Threads::kH / 2, 2, Base::Tile::kW / Base::Threads::kW, Base::Tile::kC / Base::kAccessSize> cutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Iterationstypedef Shape<Base::VectorizedTile::kH / Base::Threads::kH / 2, 2, Base::VectorizedTile::kW / Base::Threads::kW, Base::VectorizedTile::kC / Base::kAccessSize> cutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Iterations

    typedef Base::Threads cutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Threadstypedef Base::Threads cutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::Threads

    - -

    ◆ ThreadsDelta

    + +

    ◆ ThreadsDelta

    @@ -233,7 +236,7 @@

    - typedef Shape<1, 2, Base::Tile::kC> cutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::ThreadsDelta + typedef Shape<1, 2, Base::VectorizedTile::kC> cutlass::gemm::HgemmCrosswiseGlobalTileTraits< kOperand_, kLayout_, Scalar_, Tile_, Threads_, kAccessSize_ >::ThreadsDelta

    @@ -246,7 +249,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits_1_1ThreadOffset-members.html b/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits_1_1ThreadOffset-members.html index 2fa9bb6b9c..ab8b8df39d 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits_1_1ThreadOffset-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits_1_1ThreadOffset-members.html @@ -83,7 +83,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits_1_1ThreadOffset.html b/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits_1_1ThreadOffset.html index b9fb35edc8..40b60dcc86 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits_1_1ThreadOffset.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmCrosswiseGlobalTileTraits_1_1ThreadOffset.html @@ -124,7 +124,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmSwizzle-members.html b/docs/structcutlass_1_1gemm_1_1HgemmSwizzle-members.html index 1abd2b9094..aac9bd0aad 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmSwizzle-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmSwizzle-members.html @@ -89,7 +89,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmSwizzle.html b/docs/structcutlass_1_1gemm_1_1HgemmSwizzle.html index 495f144598..9f09fffe8a 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmSwizzle.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmSwizzle.html @@ -265,7 +265,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA.html b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA.html index 6d21685c97..289cb4bc49 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA.html @@ -93,7 +93,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html index 854339353e..101e43491e 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4-members.html @@ -83,14 +83,13 @@ GlobalTileTraits typedefcutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > kLayoutcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >static kScalarsIn4Bcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >static - MultiplyAddScalar typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > - Scalar typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > - SharedLoadTileTraits typedefcutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > - SharedStoreTileTraits typedefcutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > + kSkewAcutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >static + MultiplyAddScalar typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > + Scalar typedefcutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html 
b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html index 9e370edb33..b6500a957e 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperA_3_01MatrixLayout_1_1kRowMajor_00_01GemmConfig___01_4.html @@ -74,6 +74,7 @@
    cutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > Struct Template Reference
    @@ -99,12 +100,6 @@ typedef HgemmCrosswiseGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, half const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits  The traits class to build the iterator to load data from global memory for A^T. More...
      -typedef GemmSharedStoreWithSkewTileAbTraits< half, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, 2, 128/sizeof(half)/GlobalTileTraits::Threads::kW/2 > SharedStoreTileTraits - The traits class to build the iterator to store data to shared memory for A^T. More...
    -  -typedef GemmSharedLoadTileATraits< half const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 8, SharedStoreTileTraits::kSkew > SharedLoadTileTraits - The traits class to build the iterator to load from shared memory for A^T. More...
    - Public Types inherited from cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ > typedef GemmConfig_::ScalarA Scalar  The input scalar. More...
    @@ -115,15 +110,11 @@ typedef GemmGlobalTileTraits< GemmOperand::kA, MatrixLayout::kRowMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kW, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgA > GlobalTileTraits  The traits class to build the iterator to load data from global memory for A^T. More...
      -typedef GemmSharedStoreWithSkewTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsA, 128/sizeof(MultiplyAddScalar)/GemmConfig_::kScalarsPerStsA/GlobalTileTraits::Threads::kW *kScalarsIn4BSharedStoreTileTraits - The traits class to build the iterator to store data to shared memory for A^T. More...
    -  -typedef GemmSharedLoadTileATraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsA, SharedStoreTileTraits::kSkew > SharedLoadTileTraits - The traits class to build the iterator to load from shared memory for A^T. More...
    -  - + + + @@ -131,6 +122,9 @@ + + +

    -Additional Inherited Members

    +Static Public Attributes

    static int const kSkewA = 128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2
     
    - Static Public Attributes inherited from cutlass::gemm::GemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >
    static MatrixLayout::Kind const kLayout = MatrixLayout::kRowMajor
     The layout. More...
    static int const kScalarsIn4B = sizeof(MultiplyAddScalar) > 4 ? 1 : 4 / sizeof(MultiplyAddScalar)
     The number of scalars in 4B. More...
     
    static int const kSkewA
     The skew for A. More...
     

    Member Typedef Documentation

    @@ -165,34 +159,27 @@

    -

    ◆ SharedLoadTileTraits

    +

    Member Data Documentation

    + +

    ◆ kSkewA

    template<typename GemmConfig_ >
    + + + + + +
    - - -
    typedef GemmSharedLoadTileATraits< half const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 8, SharedStoreTileTraits::kSkew> cutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::SharedLoadTileTraits
    -
    - -
    - - -

    ◆ SharedStoreTileTraits

    - -
    -
    -
    -template<typename GemmConfig_ >
    - - - +
    typedef GemmSharedStoreWithSkewTileAbTraits< half, Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kW * GemmConfig_::InstructionShape::kD>, typename GlobalTileTraits::Threads, 2, 128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2> cutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::SharedStoreTileTraitsint const cutlass::gemm::HgemmTileTraitsHelperA< MatrixLayout::kRowMajor, GemmConfig_ >::kSkewA = 128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2
    +
    +static
    @@ -203,7 +190,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB.html b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB.html index b331b74bc3..e98065e5ab 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB.html @@ -93,7 +93,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html index 273311c271..b7fa3515cd 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4-members.html @@ -83,14 +83,13 @@ GlobalTileTraits typedefcutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > kLayoutcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >static kScalarsIn4Bcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >static - MultiplyAddScalar typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > - Scalar typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > - SharedLoadTileTraits typedefcutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > - SharedStoreTileTraits typedefcutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > + kSkewBcutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >static + MultiplyAddScalar typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > + Scalar typedefcutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html index 8a59bc287c..f15d401f0d 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTileTraitsHelperB_3_01MatrixLayout_1_1kColumnMajor_00_01GemmConfig___01_4.html @@ -74,6 +74,7 @@
    cutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > Struct Template Reference
    @@ -99,12 +100,6 @@ typedef HgemmCrosswiseGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, half const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits  The traits class to build the iterator to load data from global memory for B^N. More...
      -typedef GemmSharedStoreWithSkewTileAbTraits< half, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, 2, 128/sizeof(half)/GlobalTileTraits::Threads::kW/2 > SharedStoreTileTraits - The traits class to build the iterator to store data to shared memory for B^N. More...
    -  -typedef GemmSharedLoadTileBTraits< half const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 8, SharedStoreTileTraits::kSkew > SharedLoadTileTraits - The traits class to build the iterator to load from shared memory for B^N. More...
    - Public Types inherited from cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ > typedef GemmConfig_::ScalarB Scalar  The input scalar. More...
    @@ -115,15 +110,11 @@ typedef GemmGlobalTileTraits< GemmOperand::kB, MatrixLayout::kColumnMajor, Scalar const, Shape< 1, GemmConfig_::OutputTile::kH, GemmConfig_::OutputTile::kD >, Shape< 1, GemmConfig_::kThreads/GemmConfig_::OutputTile::kD, GemmConfig_::OutputTile::kD >, GemmConfig_::kScalarsPerLdgB > GlobalTileTraits  The traits class to build the iterator to load data from global memory for B^N. More...
      -typedef GemmSharedStoreWithSkewTileAbTraits< MultiplyAddScalar, Shape< GemmConfig_::kStages, GemmConfig_::OutputTile::kD/GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH *GemmConfig_::InstructionShape::kD >, typename GlobalTileTraits::Threads, GemmConfig_::kScalarsPerStsB, 128/sizeof(MultiplyAddScalar)/GemmConfig_::kScalarsPerStsB/GlobalTileTraits::Threads::kW *kScalarsIn4BSharedStoreTileTraits - The traits class to build the iterator to store data to shared memory for B^N. More...
    -  -typedef GemmSharedLoadTileBTraits< MultiplyAddScalar const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, GemmConfig_::kScalarsPerLdsB, SharedStoreTileTraits::kSkew > SharedLoadTileTraits - The traits class to build the iterator to load from shared memory for B^N. More...
    -  - + + + @@ -131,6 +122,9 @@ + + +

    -Additional Inherited Members

    +Static Public Attributes

    static int const kSkewB = 128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2
     
    - Static Public Attributes inherited from cutlass::gemm::GemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >
    static MatrixLayout::Kind const kLayout = MatrixLayout::kColumnMajor
     The layout. More...
    static int const kScalarsIn4B = sizeof(MultiplyAddScalar) > 4 ? 1 : 4 / sizeof(MultiplyAddScalar)
     The number of scalars in 4B. More...
     
    static int const kSkewB
     The skew for B. More...
     

    Member Typedef Documentation

    @@ -165,34 +159,27 @@

    -

    ◆ SharedLoadTileTraits

    +

    Member Data Documentation

    + +

    ◆ kSkewB

    template<typename GemmConfig_ >
    + + + + + +
    - - -
    typedef GemmSharedLoadTileBTraits< half const, typename GemmConfig_::OutputTile, typename GemmConfig_::Warps, typename GemmConfig_::MultiplyAdd::ThreadsPerWarp, typename GemmConfig_::InstructionShape, GemmConfig_::kStages, 8, SharedStoreTileTraits::kSkew> cutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedLoadTileTraits
    -
    - -
    - - -

    ◆ SharedStoreTileTraits

    - -
    -
    -
    -template<typename GemmConfig_ >
    - - - +
    typedef GemmSharedStoreWithSkewTileAbTraits< half, Shape<GemmConfig_::kStages, GemmConfig_::OutputTile::kD / GemmConfig_::InstructionShape::kD, GemmConfig_::OutputTile::kH * GemmConfig_::InstructionShape::kD>, typename GlobalTileTraits::Threads, 2, 128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2> cutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::SharedStoreTileTraitsint const cutlass::gemm::HgemmTileTraitsHelperB< MatrixLayout::kColumnMajor, GemmConfig_ >::kSkewB = 128 / sizeof(half) / GlobalTileTraits::Threads::kW / 2
    +
    +static
    @@ -203,7 +190,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTraits-members.html b/docs/structcutlass_1_1gemm_1_1HgemmTraits-members.html index 2a51feb222..ec460a2e73 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTraits-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTraits-members.html @@ -73,19 +73,21 @@

    -
    cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ > Member List
    +
    cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ > Member List

    -

    This is the complete list of members for cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ >, including all inherited members.

    +

    This is the complete list of members for cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ >, including all inherited members.

    - - - + + + + + @@ -98,12 +100,13 @@ - - + + +
    BlockSwizzle typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    ClearAccumulators typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    Epilogue typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    GemmConfig typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    GlobalLoadStreamA typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    GlobalLoadStreamB typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    Index typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    GlobalLoadStream typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    GlobalLoadStreamA typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    GlobalLoadStreamB typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    Index typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    KernelClass typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    kLayoutAcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >static
    kLayoutBcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >static
    MultiplyAdd typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    shared_store_fence(bool in_loop)cutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >inlinestatic
    SharedLoadStreamA typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    SharedLoadStreamB typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    SharedStoreStorageA typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    SharedStoreStorageB typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    SharedStream typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    This_ typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    ThreadblockTileStorage typedefcutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTraits.html b/docs/structcutlass_1_1gemm_1_1HgemmTraits.html index a9e4c26963..35affe7548 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTraits.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTraits.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ > Struct Template Reference +Cutlass: cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ > Struct Template Reference @@ -75,17 +75,17 @@
    -
    cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ > Struct Template Reference
    +
    cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ > Struct Template Reference

    #include <hgemm_traits.h>

    -Inheritance diagram for cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ >:
    +Inheritance diagram for cutlass::gemm::HgemmTraits< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_, Helper_ >:
    - - + + cutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators >
    @@ -93,10 +93,16 @@

    Additional Inherited Members

    - Public Types inherited from cutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators > +typedef GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators > This_ + This traits. More...
    +  +typedef cutlass::gemm::Gemm< This_KernelClass + The struct that consumes this Traits. More...
    +  typedef Helper_::GemmConfig GemmConfig  The configuration. More...
      -typedef GemmConfig::OutputTile OutputTile +typedef GemmConfig::OutputTile OutputTile  The output tile. More...
      typedef Helper_::GlobalLoadStreamA GlobalLoadStreamA @@ -117,13 +123,7 @@ typedef Helper_::SharedLoadStreamB SharedLoadStreamB  The iterator for B to load from shared memory. More...
      -typedef GlobalLoadStreamA::SharedStoreStorage SharedStoreStorageA - The shared storage for A. More...
    -  -typedef GlobalLoadStreamB::SharedStoreStorage SharedStoreStorageB - The shared storage for B. More...
    -  -typedef GemmConfig::MultiplyAdd MultiplyAdd +typedef GemmConfig::MultiplyAdd MultiplyAdd  The multiply-add functor. More...
      typedef Helper_::Epilogue Epilogue @@ -143,6 +143,15 @@ typedef Helper_::ClearAccumulators ClearAccumulators  Clear the accumulators. More...
      +typedef GlobalLoadStreamPair< GlobalLoadStreamA, GlobalLoadStreamB, GemmConfig::kResidueInProlog > GlobalLoadStream + Assemble the global load streams for A/B. More...
    +  +typedef GlobalLoadStream::ThreadblockTileStorage ThreadblockTileStorage + Memory needed to store the threadblock-scoped GEMM tile. More...
    +  +typedef SharedStreamPair< SharedLoadStreamA, SharedLoadStreamBSharedStream + Assemble the shared load streams for A/B. More...
    - Static Public Member Functions inherited from cutlass::gemm::GemmTraits< Helper_::GemmConfig, Helper_::GlobalLoadStreamA, Helper_::GlobalLoadStreamB, Helper_::SharedLoadStreamA, Helper_::SharedLoadStreamB, Helper_::Epilogue, IdentityBlockSwizzle, Index_, Helper_::ClearAccumulators > static CUTLASS_DEVICE void shared_load_fence (bool in_loop)  The memory fence for shared loads. More...
    @@ -164,7 +173,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTraits.png b/docs/structcutlass_1_1gemm_1_1HgemmTraits.png index 03fc4145b8..96fbe77ed4 100644 Binary files a/docs/structcutlass_1_1gemm_1_1HgemmTraits.png and b/docs/structcutlass_1_1gemm_1_1HgemmTraits.png differ diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTraitsHelper-members.html b/docs/structcutlass_1_1gemm_1_1HgemmTraitsHelper-members.html index 78f0782570..504d831084 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTraitsHelper-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTraitsHelper-members.html @@ -73,35 +73,35 @@
    -
    cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ > Member List
    +
    cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ > Member List
    -

    This is the complete list of members for cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >, including all inherited members.

    +

    This is the complete list of members for cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >, including all inherited members.

    - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + +
    ClearAccumulators typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    Epilogue typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GemmConfig typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GemmEpilogueTraits typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GemmTileTraitsHelperA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GemmTileTraitsHelperB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalLoadIteratorA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalLoadIteratorB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalLoadStreamA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalLoadStreamB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalTransformerA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalTransformerB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    MultiplyAdd typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedLoadIteratorA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedLoadIteratorB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedLoadStreamA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedLoadStreamB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedStoreIteratorA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedStoreIteratorB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    ClearAccumulators typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    Epilogue typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GemmConfig typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GemmEpilogueTraits typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GemmTileTraitsHelperA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GemmTileTraitsHelperB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalLoadIteratorA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalLoadIteratorB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalLoadStreamA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalLoadStreamB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalTransformerA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    GlobalTransformerB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    MultiplyAdd typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedLoadIteratorA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedLoadIteratorB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedLoadStreamA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedLoadStreamB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedStoreIteratorA typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    SharedStoreIteratorB typedefcutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >
    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTraitsHelper.html b/docs/structcutlass_1_1gemm_1_1HgemmTraitsHelper.html index 1ec8904e71..e7fabe587d 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTraitsHelper.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTraitsHelper.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ > Struct Template Reference +Cutlass: cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ > Struct Template Reference @@ -76,7 +76,7 @@ Public Types | List of all members
    -
    cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ > Struct Template Reference
    +
    cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ > Struct Template Reference
    @@ -84,362 +84,362 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Public Types

    typedef HgemmConfig< OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_ > GemmConfig
     The HGEMM config. More...
     
    typedef HgemmTileTraitsHelperA< kLayoutA_, GemmConfigGemmTileTraitsHelperA
     The GEMM config for A. More...
     
    typedef HgemmTileTraitsHelperB< kLayoutB_, GemmConfigGemmTileTraitsHelperB
     The GEMM config for B. More...
     
    typedef GemmGlobalIteratorAb< typename GemmTileTraitsHelperA::GlobalTileTraits, Index_ > GlobalLoadIteratorA
     The iterator to load A from global memory. More...
     
    typedef HgemmTransformerA< GemmTileTraitsHelperA::kLayout, GlobalLoadIteratorA >::Transformer GlobalTransformerA
     The default transformer for A. More...
     
    typedef TileStoreIterator< typename GemmTileTraitsHelperA::SharedStoreTileTraits, typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedStoreIteratorA
     The iterator to store A to shared memory. More...
     
    typedef GlobalLoadStream< GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerAGlobalLoadStreamA
     The stream to load A from global memory to shared memory. More...
     
    typedef GemmGlobalIteratorAb< typename GemmTileTraitsHelperB::GlobalTileTraits, Index_ > GlobalLoadIteratorB
     The iterator to load B from global memory. More...
     
    typedef HgemmTransformerB< GemmTileTraitsHelperB::kLayout, GlobalLoadIteratorB >::Transformer GlobalTransformerB
     
    typedef TileStoreIterator< typename GemmTileTraitsHelperB::SharedStoreTileTraits, typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedStoreIteratorB
     The iterator to store B to shared memory. More...
     
    typedef GlobalLoadStream< GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerBGlobalLoadStreamB
     The stream to load B from global memory to shared memory. More...
     
    typedef TileLoadIterator< typename GemmTileTraitsHelperA::SharedLoadTileTraits, typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedLoadIteratorA
     The iterator to load A from shared memory. More...
     
    typedef SharedLoadStream< SharedLoadIteratorASharedLoadStreamA
     The stream to load A from shared memory. More...
     
    typedef TileLoadIterator< typename GemmTileTraitsHelperB::SharedLoadTileTraits, typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedLoadIteratorB
     The iterator to load B from shared memory. More...
     
    typedef SharedLoadStream< SharedLoadIteratorBSharedLoadStreamB
     The stream to load B from shared memory. More...
     
    typedef GemmConfig::MultiplyAdd MultiplyAdd
     The functor to do the multiply-add in the main loop. More...
     
    typedef ClearAccumulators< typename MultiplyAdd::ScalarCClearAccumulators
     The object to clear accumulators. More...
     
    typedef SimplifiedGemmEpilogueTraits< GemmConfig, EpilogueFunctor_, Index_ > GemmEpilogueTraits
     The traits class for the epilogue. More...
     
    typedef GemmEpilogue< GemmEpilogueTraitsEpilogue
     The epilogue. More...
     
    typedef HgemmConfig< OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_ > GemmConfig
     The HGEMM config. More...
     
    typedef HgemmTileTraitsHelperA< kLayoutA_, GemmConfigGemmTileTraitsHelperA
     The GEMM config for A. More...
     
    typedef HgemmTileTraitsHelperB< kLayoutB_, GemmConfigGemmTileTraitsHelperB
     The GEMM config for B. More...
     
    typedef GemmGlobalIteratorAb< typename GemmTileTraitsHelperA::GlobalTileTraits, Index_ > GlobalLoadIteratorA
     The iterator to load A from global memory. More...
     
    typedef HgemmTransformerA< GemmTileTraitsHelperA::kLayout, GlobalLoadIteratorA >::Transformer GlobalTransformerA
     The default transformer for A. More...
     
    typedef TileStoreIterator< typename GemmTileTraitsHelperA::SharedStoreTileTraits, typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedStoreIteratorA
     The iterator to store A to shared memory. More...
     
    typedef GlobalLoadStream< GemmOperand::kA, GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerAGlobalLoadStreamA
     The stream to load A from global memory to shared memory. More...
     
    typedef GemmGlobalIteratorAb< typename GemmTileTraitsHelperB::GlobalTileTraits, Index_ > GlobalLoadIteratorB
     The iterator to load B from global memory. More...
     
    typedef HgemmTransformerB< GemmTileTraitsHelperB::kLayout, GlobalLoadIteratorB >::Transformer GlobalTransformerB
     
    typedef TileStoreIterator< typename GemmTileTraitsHelperB::SharedStoreTileTraits, typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedStoreIteratorB
     The iterator to store B to shared memory. More...
     
    typedef GlobalLoadStream< GemmOperand::kB, GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerBGlobalLoadStreamB
     The stream to load B from global memory to shared memory. More...
     
    typedef TileLoadIterator< typename GemmTileTraitsHelperA::SharedLoadTileTraits, typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedLoadIteratorA
     The iterator to load A from shared memory. More...
     
    typedef SharedLoadStream< SharedLoadIteratorASharedLoadStreamA
     The stream to load A from shared memory. More...
     
    typedef TileLoadIterator< typename GemmTileTraitsHelperB::SharedLoadTileTraits, typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kSharedSharedLoadIteratorB
     The iterator to load B from shared memory. More...
     
    typedef SharedLoadStream< SharedLoadIteratorBSharedLoadStreamB
     The stream to load B from shared memory. More...
     
    typedef GemmConfig::MultiplyAdd MultiplyAdd
     The functor to do the multiply-add in the main loop. More...
     
    typedef ClearAccumulators< typename MultiplyAdd::ScalarCClearAccumulators
     The object to clear accumulators. More...
     
    typedef SimplifiedGemmEpilogueTraits< GemmConfig, EpilogueFunctor_, Index_ > GemmEpilogueTraits
     The traits class for the epilogue. More...
     
    typedef GemmEpilogue< GemmEpilogueTraitsEpilogue
     The epilogue. More...
     

    Member Typedef Documentation

    - -

    ◆ ClearAccumulators

    + +

    ◆ ClearAccumulators

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef ClearAccumulators<typename MultiplyAdd::ScalarC> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::ClearAccumulatorstypedef ClearAccumulators<typename MultiplyAdd::ScalarC> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::ClearAccumulators
    - -

    ◆ Epilogue

    + +

    ◆ Epilogue

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef GemmEpilogue<GemmEpilogueTraits> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::Epiloguetypedef GemmEpilogue<GemmEpilogueTraits> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::Epilogue
    - -

    ◆ GemmConfig

    + +

    ◆ GemmConfig

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef HgemmConfig<OutputTile_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GemmConfigtypedef HgemmConfig<OutputTile_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GemmConfig
    - -

    ◆ GemmEpilogueTraits

    + +

    ◆ GemmEpilogueTraits

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef SimplifiedGemmEpilogueTraits<GemmConfig, EpilogueFunctor_, Index_> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GemmEpilogueTraitstypedef SimplifiedGemmEpilogueTraits<GemmConfig, EpilogueFunctor_, Index_> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GemmEpilogueTraits
    - -

    ◆ GemmTileTraitsHelperA

    + +

    ◆ GemmTileTraitsHelperA

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef HgemmTileTraitsHelperA<kLayoutA_, GemmConfig> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GemmTileTraitsHelperAtypedef HgemmTileTraitsHelperA<kLayoutA_, GemmConfig> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GemmTileTraitsHelperA
    - -

    ◆ GemmTileTraitsHelperB

    + +

    ◆ GemmTileTraitsHelperB

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef HgemmTileTraitsHelperB<kLayoutB_, GemmConfig> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GemmTileTraitsHelperBtypedef HgemmTileTraitsHelperB<kLayoutB_, GemmConfig> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GemmTileTraitsHelperB
    - -

    ◆ GlobalLoadIteratorA

    + +

    ◆ GlobalLoadIteratorA

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef GemmGlobalIteratorAb<typename GemmTileTraitsHelperA::GlobalTileTraits, Index_> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalLoadIteratorAtypedef GemmGlobalIteratorAb<typename GemmTileTraitsHelperA::GlobalTileTraits, Index_> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalLoadIteratorA
    - -

    ◆ GlobalLoadIteratorB

    + +

    ◆ GlobalLoadIteratorB

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef GemmGlobalIteratorAb<typename GemmTileTraitsHelperB::GlobalTileTraits, Index_> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalLoadIteratorBtypedef GemmGlobalIteratorAb<typename GemmTileTraitsHelperB::GlobalTileTraits, Index_> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalLoadIteratorB
    - -

    ◆ GlobalLoadStreamA

    + +

    ◆ GlobalLoadStreamA

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef GlobalLoadStream<GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalLoadStreamAtypedef GlobalLoadStream<GemmOperand::kA, GlobalLoadIteratorA, SharedStoreIteratorA, GlobalTransformerA> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalLoadStreamA
    - -

    ◆ GlobalLoadStreamB

    + +

    ◆ GlobalLoadStreamB

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef GlobalLoadStream<GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalLoadStreamBtypedef GlobalLoadStream<GemmOperand::kB, GlobalLoadIteratorB, SharedStoreIteratorB, GlobalTransformerB> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalLoadStreamB
    - -

    ◆ GlobalTransformerA

    + +

    ◆ GlobalTransformerA

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef HgemmTransformerA<GemmTileTraitsHelperA::kLayout, GlobalLoadIteratorA>::Transformer cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalTransformerAtypedef HgemmTransformerA<GemmTileTraitsHelperA::kLayout, GlobalLoadIteratorA>::Transformer cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalTransformerA
    - -

    ◆ GlobalTransformerB

    + +

    ◆ GlobalTransformerB

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef HgemmTransformerB<GemmTileTraitsHelperB::kLayout, GlobalLoadIteratorB>::Transformer cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalTransformerBtypedef HgemmTransformerB<GemmTileTraitsHelperB::kLayout, GlobalLoadIteratorB>::Transformer cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::GlobalTransformerB
    - -

    ◆ MultiplyAdd

    + +

    ◆ MultiplyAdd

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef GemmConfig::MultiplyAdd cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::MultiplyAddtypedef GemmConfig::MultiplyAdd cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::MultiplyAdd
    - -

    ◆ SharedLoadIteratorA

    + +

    ◆ SharedLoadIteratorA

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef TileLoadIterator<typename GemmTileTraitsHelperA::SharedLoadTileTraits, typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedLoadIteratorAtypedef TileLoadIterator<typename GemmTileTraitsHelperA::SharedLoadTileTraits, typename GemmTileTraitsHelperA::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedLoadIteratorA
    - -

    ◆ SharedLoadIteratorB

    + +

    ◆ SharedLoadIteratorB

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef TileLoadIterator<typename GemmTileTraitsHelperB::SharedLoadTileTraits, typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedLoadIteratorBtypedef TileLoadIterator<typename GemmTileTraitsHelperB::SharedLoadTileTraits, typename GemmTileTraitsHelperB::SharedLoadTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedLoadIteratorB
    - -

    ◆ SharedLoadStreamA

    + +

    ◆ SharedLoadStreamA

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef SharedLoadStream<SharedLoadIteratorA> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedLoadStreamAtypedef SharedLoadStream<SharedLoadIteratorA> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedLoadStreamA
    - -

    ◆ SharedLoadStreamB

    + +

    ◆ SharedLoadStreamB

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef SharedLoadStream<SharedLoadIteratorB> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedLoadStreamBtypedef SharedLoadStream<SharedLoadIteratorB> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedLoadStreamB
    - -

    ◆ SharedStoreIteratorA

    + +

    ◆ SharedStoreIteratorA

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef TileStoreIterator<typename GemmTileTraitsHelperA::SharedStoreTileTraits, typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedStoreIteratorAtypedef TileStoreIterator<typename GemmTileTraitsHelperA::SharedStoreTileTraits, typename GemmTileTraitsHelperA::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedStoreIteratorA
    - -

    ◆ SharedStoreIteratorB

    + +

    ◆ SharedStoreIteratorB

    -template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename AccumulatorsPerThread_ = Shape<32, 8, 8>, int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    +template<MatrixLayout::Kind kLayoutA_, MatrixLayout::Kind kLayoutB_, typename OutputTile_ , typename EpilogueFunctor_ , typename ThreadGemmShape_ , int kScalarsPerLdgA_ = 2, int kScalarsPerLdgB_ = 2, typename Index_ = int>
    - +
    typedef TileStoreIterator<typename GemmTileTraitsHelperB::SharedStoreTileTraits, typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, AccumulatorsPerThread_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedStoreIteratorBtypedef TileStoreIterator<typename GemmTileTraitsHelperB::SharedStoreTileTraits, typename GemmTileTraitsHelperB::SharedStoreTileTraits::Scalar, IteratorAdvance::kH, MemorySpace::kShared> cutlass::gemm::HgemmTraitsHelper< kLayoutA_, kLayoutB_, OutputTile_, EpilogueFunctor_, ThreadGemmShape_, kScalarsPerLdgA_, kScalarsPerLdgB_, Index_ >::SharedStoreIteratorB
    @@ -452,7 +452,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA.html index 505f08a55a..971b592707 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA.html @@ -84,7 +84,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4-members.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4-members.html index 7846aeebf5..13fdca496d 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4-members.html @@ -83,7 +83,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4.html index 092d948b91..162e5cd232 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4.html @@ -110,7 +110,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4-members.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4-members.html index 4877f00119..69a96f49fc 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4-members.html @@ -83,7 +83,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4.html index 8837b6996e..540f7a7d16 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerA_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4.html @@ -110,7 +110,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB.html index 99325439a4..acddad07a2 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB.html @@ -84,7 +84,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4-members.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4-members.html index fdfc6a0035..007e2fd31f 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4-members.html @@ -83,7 +83,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4.html index 93727e2762..73c853d113 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kColumnMajor_00_01Iterator___01_4.html @@ -110,7 +110,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4-members.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4-members.html index e655326e55..1f1b6e2451 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4-members.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4-members.html @@ -83,7 +83,7 @@ diff --git a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4.html b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4.html index 013566c2f1..d1d5706b15 100644 --- a/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4.html +++ b/docs/structcutlass_1_1gemm_1_1HgemmTransformerB_3_01MatrixLayout_1_1kRowMajor_00_01Iterator___01_4.html @@ -110,7 +110,7 @@

    diff --git a/docs/structcutlass_1_1gemm_1_1IdentityBlockSwizzle-members.html b/docs/structcutlass_1_1gemm_1_1IdentityBlockSwizzle-members.html index 748a3a4955..120fa5588c 100644 --- a/docs/structcutlass_1_1gemm_1_1IdentityBlockSwizzle-members.html +++ b/docs/structcutlass_1_1gemm_1_1IdentityBlockSwizzle-members.html @@ -79,12 +79,15 @@

    This is the complete list of members for cutlass::gemm::IdentityBlockSwizzle, including all inherited members.

    - - + + + + +
    IdentityBlockSwizzle()cutlass::gemm::IdentityBlockSwizzleinline
    swizzle()cutlass::gemm::IdentityBlockSwizzleinline
    get_batch_id()cutlass::gemm::IdentityBlockSwizzleinline
    get_grid_layout(GemmCoord const &problem_size, Coord< 3 > const &OutputTile)cutlass::gemm::IdentityBlockSwizzleinline
    get_threadblock_offset(Coord< 3 > const &OutputTile)cutlass::gemm::IdentityBlockSwizzleinline
    IdentityBlockSwizzle()cutlass::gemm::IdentityBlockSwizzleinline
    swizzle()cutlass::gemm::IdentityBlockSwizzleinline
    diff --git a/docs/structcutlass_1_1gemm_1_1IdentityBlockSwizzle.html b/docs/structcutlass_1_1gemm_1_1IdentityBlockSwizzle.html index 68a70c7b38..02f1b0715b 100644 --- a/docs/structcutlass_1_1gemm_1_1IdentityBlockSwizzle.html +++ b/docs/structcutlass_1_1gemm_1_1IdentityBlockSwizzle.html @@ -80,20 +80,26 @@
    -

    #include <identity_block_swizzle.h>

    +

    #include <threadblock_swizzle.h>

    - - - + + + + + + + + +

    Public Member Functions

    CUTLASS_DEVICE IdentityBlockSwizzle ()
     Ctor. More...
     
    CUTLASS_HOST_DEVICE IdentityBlockSwizzle ()
     Ctor. aka ColumnMajorBlockSwizzle<1> More...
     
    CUTLASS_DEVICE dim3 swizzle ()
     Swizzle the block index. More...
     
    CUTLASS_HOST_DEVICE dim3 get_grid_layout (GemmCoord const &problem_size, Coord< 3 > const &OutputTile)
     
    CUTLASS_DEVICE Coord< 3 > get_threadblock_offset (Coord< 3 > const &OutputTile)
     
    CUTLASS_DEVICE int get_batch_id ()
     

    Constructor & Destructor Documentation

    - -

    ◆ IdentityBlockSwizzle()

    + +

    ◆ IdentityBlockSwizzle()

    + +

    ◆ get_grid_layout()

    + +
    +
    + + + + + +
    + + + + + + + + + + + + + + + + + + +
    CUTLASS_HOST_DEVICE dim3 cutlass::gemm::IdentityBlockSwizzle::get_grid_layout (GemmCoord const & problem_size,
    Coord< 3 > const & OutputTile 
    )
    +
    +inline
    +
    + +
    +
    + +

    ◆ get_threadblock_offset()

    + +
    +
    + + + + + +
    + + + + + + + + +
    CUTLASS_DEVICE Coord<3> cutlass::gemm::IdentityBlockSwizzle::get_threadblock_offset (Coord< 3 > const & OutputTile)
    +
    +inline
    +
    + +
    +

    ◆ swizzle()

    @@ -144,12 +237,12 @@

    identity_block_swizzle.h +
  • threadblock_swizzle.h
  • diff --git a/docs/structcutlass_1_1gemm_1_1IgemmConfig-members.html b/docs/structcutlass_1_1gemm_1_1IgemmConfig-members.html index bb78c951e8..a3ae8ada2f 100644 --- a/docs/structcutlass_1_1gemm_1_1IgemmConfig-members.html +++ b/docs/structcutlass_1_1gemm_1_1IgemmConfig-members.html @@ -73,41 +73,44 @@
    -
    cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, AccumulatorsPerThread_ > Member List
    +
    cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, ThreadGemmShape_ > Member List
    -

    This is the complete list of members for cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, AccumulatorsPerThread_ >, including all inherited members.

    +

    This is the complete list of members for cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, ThreadGemmShape_ >, including all inherited members.

    - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Accumulators typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    AccumulatorsPerWarp typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    InstructionShape typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    kAccumulatorsPerLdsAcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kAccumulatorsPerLdsBcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerLdgAcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerLdgBcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerLdgCcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerLdsAcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerLdsBcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerLdsDcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerStgDcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerStsAcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerStsBcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kScalarsPerStsDcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kStagescutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kThreadscutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    kWarpSizecutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >static
    MultiplyAdd typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    OutputTile typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    ScalarA typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    ScalarB typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    ScalarC typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    ScalarD typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    Warps typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    Accumulators typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    AccumulatorsPerWarp typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    InstructionShape typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    kAccumulatorsPerLdsAcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kAccumulatorsPerLdsBcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kLaunchBoundscutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kResidueInPrologcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kResidueSeparatecutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerLdgAcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerLdgBcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerLdgCcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerLdsAcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerLdsBcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerLdsDcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerStgDcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerStsAcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerStsBcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kScalarsPerStsDcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kStagescutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kThreadscutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    kWarpSizecutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >static
    MultiplyAdd typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    OutputTile typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    ScalarA typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    ScalarB typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    ScalarC typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    ScalarD typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    Warps typedefcutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    diff --git a/docs/structcutlass_1_1gemm_1_1IgemmConfig.html b/docs/structcutlass_1_1gemm_1_1IgemmConfig.html index cbcfef4564..89ac24e89b 100644 --- a/docs/structcutlass_1_1gemm_1_1IgemmConfig.html +++ b/docs/structcutlass_1_1gemm_1_1IgemmConfig.html @@ -5,7 +5,7 @@ -Cutlass: cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, AccumulatorsPerThread_ > Struct Template Reference +Cutlass: cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, ThreadGemmShape_ > Struct Template Reference @@ -75,93 +75,102 @@
    -
    cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, AccumulatorsPerThread_ > Struct Template Reference
    +
    cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, ThreadGemmShape_ > Struct Template Reference

    #include <igemm_traits.h>

    -Inheritance diagram for cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, AccumulatorsPerThread_ >:
    +Inheritance diagram for cutlass::gemm::IgemmConfig< OutputTile_, ScalarD_, ThreadGemmShape_ >:
    - - -cutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 > + + +cutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Additional Inherited Members

    - Public Types inherited from cutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    typedef int8_t ScalarA
     The scalar for A. More...
     
    typedef int8_t ScalarB
     The scalar for B. More...
     
    typedef ScalarD_ ScalarC
     The scalar for C. More...
     
    typedef ScalarD_ ScalarD
     The scalar for D. More...
     
    typedef OutputTile_ OutputTile
     The tile. More...
     
    typedef ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int > MultiplyAdd
     The functor to do D = A*B + C. More...
     
    typedef MultiplyAdd::InstructionShape InstructionShape
     The shape of the instruction. More...
     
    typedef MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
     The number of accumulators per warp. More...
     
    typedef MultiplyAdd::Accumulators Accumulators
     The accumulators. More...
     
    typedef ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
     The number of warps. More...
     
    - Static Public Attributes inherited from cutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< AccumulatorsPerThread_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2 >
    static int const kWarpSize
     The default warp size (32 threads per warp). More...
     
    static int const kThreads
     The number of threads. More...
     
    static int const kScalarsPerLdgA
     The number of scalars per LDG/STS/LDS for A. More...
     
    static int const kScalarsPerStsA
     
    static int const kScalarsPerLdsA
     
    static int const kScalarsPerLdgB
     The number of scalars per LDG/STS/LDS for B. More...
     
    static int const kScalarsPerStsB
     
    static int const kScalarsPerLdsB
     
    static int const kScalarsPerLdgC
     The number of scalars per LDG for C. More...
     
    static int const kScalarsPerStgD
     The number of scalars per STS/LDS/STG for D. More...
     
    static int const kScalarsPerStsD
     
    static int const kScalarsPerLdsD
     
    static int const kAccumulatorsPerLdsA
     The number of accumulators that are going to be fed from one LDS A/B. More...
     
    static int const kAccumulatorsPerLdsB
     
    static int const kStages
     The number of stages in shared memory to implement double, triple, more-buffering. More...
     
    - Public Types inherited from cutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    typedef int8_t ScalarA
     The scalar for A. More...
     
    typedef int8_t ScalarB
     The scalar for B. More...
     
    typedef ScalarD_ ScalarC
     The scalar for C. More...
     
    typedef ScalarD_ ScalarD
     The scalar for D. More...
     
    typedef OutputTile_ OutputTile
     The tile. More...
     
    typedef ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int > MultiplyAdd
     The functor to do D = A*B + C. More...
     
    typedef MultiplyAdd::InstructionShape InstructionShape
     The shape of the instruction. More...
     
    typedef MultiplyAdd::AccumulatorsPerWarp AccumulatorsPerWarp
     The shape of warp-level GEMM. More...
     
    typedef MultiplyAdd::Accumulators Accumulators
     The accumulators. More...
     
    typedef ShapeDiv< OutputTile, AccumulatorsPerWarp >::Shape Warps
     The number of warps. More...
     
    - Static Public Attributes inherited from cutlass::gemm::GemmConfig< int8_t, int8_t, ScalarD_, ScalarD_, OutputTile_, ThreadMultiplyAdd< ThreadGemmShape_, Shape< 1, 4, 8 >, int8_t, int8_t, int >, 4, 4, 16, 4, 4, 16, 1, 4, 1, 2, false, false, false >
    static int const kWarpSize
     The default warp size (32 threads per warp). More...
     
    static int const kThreads
     The number of threads. More...
     
    static int const kScalarsPerLdgA
     The number of scalars per LDG/STS/LDS for A. More...
     
    static int const kScalarsPerStsA
     
    static int const kScalarsPerLdsA
     
    static int const kScalarsPerLdgB
     The number of scalars per LDG/STS/LDS for B. More...
     
    static int const kScalarsPerStsB
     
    static int const kScalarsPerLdsB
     
    static int const kScalarsPerLdgC
     The number of scalars per LDG for C. More...
     
    static int const kScalarsPerStgD
     The number of scalars per STS/LDS/STG for D. More...
     
    static int const kScalarsPerStsD
     
    static int const kScalarsPerLdsD
     
    static int const kAccumulatorsPerLdsA
     The number of accumulators that are going to be fed from one LDS A/B. More...
     
    static int const kAccumulatorsPerLdsB
     
    static int const kStages
     The number of stages in shared memory to implement double, triple, more-buffering. More...
     
    static bool const kResidueSeparate
     If true, mainloop is instantiated twice. The first instantiation contains no predicate. More...
     
    static bool const kResidueInProlog
     If true, residue is computed in the prologue. More...
     
    static bool const kLaunchBounds
     If true, kernel is launched with launch bounds specified. More...
     

    The documentation for this struct was generated from the following file: