Merge pull request NVIDIA#13 from NVIDIA/cutlass_v1.0.1
Cutlass v1.0.1
kerrmudgeon authored Jun 12, 2018
2 parents 9fd5546 + c566e83 commit e1c4ba5
Showing 354 changed files with 3,329 additions and 129,924 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "tools/external/googletest"]
path = tools/external/googletest
url = https://github.com/google/googletest.git
47 changes: 47 additions & 0 deletions changelog.md
@@ -0,0 +1,47 @@
# NVIDIA CUTLASS Changelog

## 1.0.1 (2018-06-11)

* Intra-threadblock reduction added for small threadblock tile sizes
  * sgemm_64x128x16, sgemm_128x128x16, sgemm_128x64x16, sgemm_128x32x16, sgemm_64x64x16, sgemm_64x32x16
  * igemm_32x32x128
* GEMM _K_ residue handled during prologue prior to mainloop
* Replaced Google Test copy with submodule. Use `git submodule init`

## [1.0.0](https://github.com/NVIDIA/cutlass/commit/2028ebe120aab22bfd0b2baf8902d4c9627eb33f) (2018-05-16)

* Substantial rewrite to accommodate new architecture
* Kernels: SGEMM, DGEMM, IGEMM, HGEMM, WMMA GEMM
* Unit and performance tests

## [0.0.1](https://github.com/NVIDIA/cutlass/commit/d08ba8ac46e2fa3f745e070c390182edb56b2e91) (2017-12-04)

* Initial release


## Copyright

Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.

```
Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of
conditions and the following disclaimer in the documentation and/or other materials
provided with the distribution.
* Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
to endorse or promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

2 changes: 1 addition & 1 deletion cutlass/cutlass.h
@@ -33,7 +33,7 @@

#define CUTLASS_MAJOR 1
#define CUTLASS_MINOR 0
#define CUTLASS_PATCH 0
#define CUTLASS_PATCH 1
#define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)

#ifdef __NVCC__
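With the patch bump, `CUTLASS_VERSION` evaluates to 101 (1*100 + 0*10 + 1). Below is a minimal sketch of how client code might gate on the macro; the include path and the guarded message are assumptions for illustration, not part of this commit:

```cpp
// Sketch only: compile-time gating on the version macro changed above.
// CUTLASS_VERSION = MAJOR*100 + MINOR*10 + PATCH, so release 1.0.1 yields 101.
#include <cstdio>

#include "cutlass/cutlass.h"

int main() {
#if CUTLASS_VERSION >= 101
  std::printf("CUTLASS %d.%d.%d (1.0.1 or newer)\n",
              CUTLASS_MAJOR, CUTLASS_MINOR, CUTLASS_PATCH);
#else
  std::printf("older CUTLASS release\n");
#endif
  return 0;
}
```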
4 changes: 2 additions & 2 deletions cutlass/fragment.h
@@ -184,7 +184,7 @@ struct FragmentIterator {
/// The shape of the fragment.
typedef typename ShapeMul<Iterations, Shape<1, 1, 1, kElementsPerAccess> >::Shape FragmentShape;
/// The linear strides for iterations.
typedef typename ShapeStrides<FragmentShape>::Shape Strides;
typedef typename ShapeStrides<FragmentShape, kElementsPerAccess>::Shape Strides;

/// Ctor.
template <typename OtherFragment_>
@@ -242,7 +242,7 @@ struct FragmentConstIterator {
/// The shape of the fragment.
typedef typename ShapeMul<Iterations, Shape<1, 1, 1, kElementsPerAccess> >::Shape FragmentShape;
/// The linear strides for iterations.
typedef typename ShapeStrides<FragmentShape>::Shape IterationsStrides;
typedef typename ShapeStrides<FragmentShape, kElementsPerAccess>::Shape IterationsStrides;

/// Ctor.
template <typename OtherFragment_>
68 changes: 43 additions & 25 deletions cutlass/fragment_multiply_add.h
@@ -49,21 +49,29 @@ struct FragmentMultiplyAdd {
CUTLASS_DEVICE FragmentMultiplyAdd() {}

/// Multiply : d = a*b.
template <typename Fragment_>
CUTLASS_DEVICE void multiply(Scalar_ a, Fragment_ const& b, Fragment_& d) {
for (int j = 0; j < Fragment_::kElements; ++j) {
d[j] = a * b[j];
template <typename FragmentB_, typename FragmentCd_>
CUTLASS_DEVICE void multiply(Scalar_ a, FragmentB_ const& b, FragmentCd_& d) {
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
for (int j = 0; j < FragmentCd_::kElements; ++j) {
d[j] = a * b[j * kReduction + 0];
for (int k = 1; k < kReduction; ++k) {
d[j] += a * b[j * kReduction + k];
}
}
}

/// Multiply : d = a*b + c.
template <typename Fragment_>
template <typename FragmentB_, typename FragmentCd_>
CUTLASS_DEVICE void multiply_add(Scalar_ a,
Fragment_ const& b,
Fragment_ const& c,
Fragment_& d) {
for (int j = 0; j < Fragment_::kElements; ++j) {
d[j] = a * b[j] + c[j];
FragmentB_ const& b,
FragmentCd_ const& c,
FragmentCd_& d) {
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
for (int j = 0; j < FragmentCd_::kElements; ++j) {
d[j] = a * b[j * kReduction + 0] + c[j];
for (int k = 1; k < kReduction; ++k) {
d[j] += a * b[j * kReduction + k];
}
}
}
};
@@ -74,7 +82,7 @@
template <>
struct FragmentMultiplyAdd<half> {
/// The shape of the instruction.
typedef Shape<1, 1, 1, 1> InstructionShape;
typedef Shape<1, 1, 2, 1> InstructionShape;
/// The type for A.
typedef half ScalarA;
/// The type for B.
@@ -86,38 +94,48 @@ struct FragmentMultiplyAdd<half> {
CUTLASS_DEVICE FragmentMultiplyAdd() {}

/// Multiply : d = a*b.
template <typename Fragment_>
CUTLASS_DEVICE void multiply(half a, Fragment_ const& b, Fragment_& d) {
template <typename FragmentB_, typename FragmentCd_>
CUTLASS_DEVICE void multiply(half a, FragmentB_ const& b, FragmentCd_& d) {
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 530

// Assemble a half2 from a.
__half2 const a_half2 = __half2half2(a);
// The input.
__half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
// The output.
__half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);

// Assemble a half2 from a.
__half2 const a_half2 = __half2half2(a);

for (int i = 0; i < Fragment_::kElements / 2; ++i) {
d_half2[i] = __hmul2(a_half2, b_half2[i]);
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
for (int j = 0; j < FragmentCd_::kElements / 2; ++j) {
d_half2[j] = __hmul2(a_half2, b_half2[j * kReduction + 0]);
for (int k = 1; k < kReduction; ++k) {
d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]);
}
}
#endif
}

/// Multiply : d = a*b + c.
template <typename Fragment_>
CUTLASS_DEVICE void multiply_add(half a, Fragment_ const& b, Fragment_ const& c, Fragment_& d) {
template <typename FragmentB_, typename FragmentCd_>
CUTLASS_DEVICE void multiply_add(half a,
FragmentB_ const& b,
FragmentCd_ const& c,
FragmentCd_& d) {
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
// Assemble a half2 from a.
__half2 const a_half2 = __half2half2(a);
// The inputs.
__half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
__half2 const* c_half2 = reinterpret_cast<__half2 const*>(&c[0]);
// The output.
__half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);

// Assemble a half2 from a.
__half2 const a_half2 = __half2half2(a);

for (int i = 0; i < Fragment_::kElements / 2; ++i) {
d_half2[i] = __hfma2(a_half2, b_half2[i], c_half2[i]);
int const kReduction = (FragmentB_::kElements / FragmentCd_::kElements);
for (int j = 0; j < FragmentCd_::kElements / 2; ++j) {
d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + 0], c_half2[j]);
for (int k = 1; k < kReduction; ++k) {
d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]);
}
}
#endif
}
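The substantive change in `FragmentMultiplyAdd` (both the generic template and the `half` specialization) is that `multiply` and `multiply_add` now take separate fragment types for B and for C/D: when B holds `kReduction = FragmentB_::kElements / FragmentCd_::kElements` times more elements than the output, each output element accumulates `a * b` over its `kReduction`-wide slice, in line with the intra-threadblock reduction noted in the changelog. Here is a small standalone sketch of that reduction pattern, with plain arrays standing in for CUTLASS fragments (an assumption for illustration only):

```cpp
// Sketch of the kReduction accumulation pattern; not CUTLASS code.
#include <cstdio>

template <int NB, int NCD>
void multiply_add_sketch(float a, float const (&b)[NB], float const (&c)[NCD], float (&d)[NCD]) {
  static_assert(NB % NCD == 0, "B must hold a whole multiple of the C/D elements");
  int const kReduction = NB / NCD;
  for (int j = 0; j < NCD; ++j) {
    d[j] = a * b[j * kReduction + 0] + c[j];  // seed with the first element of the slice
    for (int k = 1; k < kReduction; ++k) {
      d[j] += a * b[j * kReduction + k];      // reduce the rest of the slice
    }
  }
}

int main() {
  float b[4] = {1.f, 2.f, 3.f, 4.f};
  float c[2] = {10.f, 20.f};
  float d[2];
  multiply_add_sketch(2.f, b, c, d);  // d = {2*(1+2)+10, 2*(3+4)+20} = {16, 34}
  std::printf("%g %g\n", d[0], d[1]);
  return 0;
}
```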
2 changes: 2 additions & 0 deletions cutlass/gemm/clear_accumulators.h
@@ -39,6 +39,8 @@ struct ClearAccumulators {
/// The shared storage.
struct SharedStorage {};

/// Ctor.
CUTLASS_DEVICE ClearAccumulators() {}
/// Ctor.
CUTLASS_DEVICE ClearAccumulators(SharedStorage& shared_storage) {}

139 changes: 82 additions & 57 deletions cutlass/gemm/gemm.h
@@ -40,7 +40,7 @@ namespace gemm {
////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Gemm_>
__global__ void gemm_kernel(typename Gemm_::Params params) {
__global__ /*__launch_bounds__(Gemm_::kThreads)*/ void gemm_kernel(typename Gemm_::Params params) {
// Declare shared memory.
__shared__ typename Gemm_::SharedStorage shared_storage;

@@ -193,6 +193,71 @@ struct Gemm {
CUTLASS_DEVICE Gemm(Params const& params_, SharedStorage& shared_storage_)
: params(params_), shared_storage(shared_storage_) {}

/// Consume a single iteration of the loop.
template <bool kIsLastIteration>
CUTLASS_DEVICE void consume_tile(typename Traits::GlobalLoadStream& global_stream,
typename Traits::SharedLoadStream& shared_load_stream,
typename Traits::MultiplyAdd::Accumulators& accumulators,
Index outer_k) {
// If that's the last "load iteration" update the predicates.
if (!kIsLastIteration) {
global_stream.move_to_residue<false>(outer_k);
}

// Load data for the next iteration of the main loop.
if (!kIsLastIteration) {
global_stream.copy();
}

// The unrolling steps for the main loop.
int const kUnrollingSteps =
Traits::MultiplyAdd::AccumulatorsPerWarp::kD / Traits::MultiplyAdd::InstructionShape::kD;

CUTLASS_PRAGMA_UNROLL
for (int step = 0; step < kUnrollingSteps - 1; ++step) {
// Trigger the copy from shared memory for the next A/B values.
shared_load_stream.copy(step + 1);
// Make sure the values are available for the current iteration to do the multiply-add.
shared_load_stream.commit(step);

// Do the math on the fragments of the current iteration.
typename Traits::MultiplyAdd multiply_add;
multiply_add.multiply_add(shared_load_stream.fragment_a(step),
shared_load_stream.fragment_b(step),
accumulators,
accumulators);
}

// Make sure the data from shared memory has been entirely consumed.
Traits::shared_load_fence(true);

// Commit the data in shared memory for A/B.
if (!kIsLastIteration) {
global_stream.commit();
}

// Make sure the data is in shared memory.
Traits::shared_store_fence(true);

// Trigger the loads for the next iteration (if needed).
if (!kIsLastIteration) {
// Move to the next stage for the load (if it makes sense).
shared_load_stream.inc_stage();
// Trigger the copy from shared memory for the next loop iteration.
shared_load_stream.copy(0);
}

// Make sure the values are available for the current iteration to do the multiply-add.
shared_load_stream.commit(kUnrollingSteps - 1);

// Do the math on the fragments of the current iteration.
typename Traits::MultiplyAdd multiply_add;
multiply_add.multiply_add(shared_load_stream.fragment_a(kUnrollingSteps - 1),
shared_load_stream.fragment_b(kUnrollingSteps - 1),
accumulators,
accumulators);
}

/// Do the GEMM.
CUTLASS_DEVICE void multiply_add() {
// Swizzle the IDs of the block (to enable better cache behavior).
@@ -212,16 +277,11 @@ struct Gemm {
// Create the accumulator clear.
ClearAccumulators clear(shared_storage.main_loop.clear);

/// Define the mainloop iteration size
typedef typename Traits::MultiplyAdd MultiplyAdd;

// By how much we unroll the main loop.
Index const kUnroll = static_cast<Index>(MultiplyAdd::AccumulatorsPerWarp::kD);
Index const kUnroll = static_cast<Index>(Traits::OutputTile::kD);

// If we do not have enough steps in the main loop, trigger the residue code.
if (params.k < kUnroll) {
global_stream.residue(params.k, true);
}
global_stream.move_to_residue<true>(params.k);

// Fetch the fragments for A and B from global memory.
global_stream.copy();
@@ -232,9 +292,12 @@ struct Gemm {
// Make sure the data is in shared memory.
Traits::shared_store_fence(false);

// Rollback to the beginning of the GEMM-K dimension. It may have no impact.
global_stream.rollback();

// The unrolling steps for the main loop.
int const kUnrollingSteps =
MultiplyAdd::AccumulatorsPerWarp::kD / MultiplyAdd::InstructionShape::kD;
Traits::MultiplyAdd::AccumulatorsPerWarp::kD / Traits::MultiplyAdd::InstructionShape::kD;

// Make sure we have at least 2 unrolling steps or our pipelining is not going to work.
static_assert(kUnrollingSteps >= 2, "The pipelining assumes at least two steps");
@@ -246,59 +309,21 @@ struct Gemm {
shared_load_stream.copy(0);

// Allocate the accumulators.
typename MultiplyAdd::Accumulators accumulators;
typename Traits::MultiplyAdd::Accumulators accumulators;
// Clear the accumulators.
clear.clear(accumulators);

// Enter the main loop and iterate.
typedef typename Traits::Index Index;
for (Index outer_k = params.k - kUnroll; outer_k > -kUnroll; outer_k -= kUnroll) {
// If that's the last "load iteration" update the predicates.
int const is_residue = outer_k <= kUnroll;
if (is_residue) {
global_stream.residue(outer_k);
}

// Load data for the next iteration of the main loop.
global_stream.copy();

CUTLASS_PRAGMA_UNROLL
for (int step = 0; step < kUnrollingSteps - 1; ++step) {
// Trigger the copy from shared memory for the next A/B values.
shared_load_stream.copy(step + 1);
// Make sure the values are available for the current iteration to do the multiply-add.
shared_load_stream.commit(step);

// Do the math on the fragments of the current iteration.
MultiplyAdd multiply_add;
multiply_add.multiply_add(shared_load_stream.fragment_a(step),
shared_load_stream.fragment_b(step),
accumulators,
accumulators);
}

// Make sure the data from shared memory has been entirely consumed.
Traits::shared_load_fence(true);

// Commit the data in shared memory for A/B.
global_stream.commit();

// Make sure the data is in shared memory.
Traits::shared_store_fence(true);
// The loop index.
Index outer_k = params.k - kUnroll;

// Move to the next stage for the load (if it makes sense).
shared_load_stream.inc_stage();
// Trigger the copy from shared memory for the next loop iteration.
shared_load_stream.copy(0);
// Make sure the values are available for the current iteration to do the multiply-add.
shared_load_stream.commit(kUnrollingSteps - 1);
// Enter the main loop and iterate.
for (; outer_k > 0; outer_k -= kUnroll) {
consume_tile<false>(global_stream, shared_load_stream, accumulators, outer_k);
}

// Do the math on the fragments of the current iteration.
MultiplyAdd multiply_add;
multiply_add.multiply_add(shared_load_stream.fragment_a(kUnrollingSteps - 1),
shared_load_stream.fragment_b(kUnrollingSteps - 1),
accumulators,
accumulators);
// Residual loop.
for (; outer_k > -kUnroll; outer_k -= kUnroll) {
consume_tile<true>(global_stream, shared_load_stream, accumulators, outer_k);
}

// Epilogue.
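The main loop in `Gemm::multiply_add` is refactored around the new `consume_tile<kIsLastIteration>` helper: a steady-state loop runs `consume_tile<false>` while further GEMM-K tiles remain (issuing the next tile's global loads and shared-memory commits), and a residual loop runs `consume_tile<true>`, which compiles those steps out; the K residue itself is handled in the prologue via `move_to_residue`/`rollback`. A host-side sketch of just this control flow follows; the tile depth, problem size, and printouts are assumptions for illustration, not CUTLASS code:

```cpp
// Control-flow sketch of the split main/residual loop; not CUTLASS code.
#include <cstdio>

// Mirrors kIsLastIteration above: when true, work needed only by a following
// tile (global prefetch, shared-memory commit) is compiled away.
template <bool kIsLastIteration>
void consume_tile_sketch(int outer_k) {
  if (!kIsLastIteration) {
    std::printf("  prefetch next tile (outer_k = %d)\n", outer_k);
  }
  std::printf("  multiply-add on current tile\n");
}

int main() {
  int const k = 96;        // assumed GEMM-K extent
  int const kUnroll = 32;  // assumed Traits::OutputTile::kD (tile depth)

  int outer_k = k - kUnroll;
  // Main loop: at least one more full tile follows this one.
  for (; outer_k > 0; outer_k -= kUnroll) {
    consume_tile_sketch<false>(outer_k);
  }
  // Residual loop: the final tile(s); no next-tile loads are issued.
  for (; outer_k > -kUnroll; outer_k -= kUnroll) {
    consume_tile_sketch<true>(outer_k);
  }
  return 0;
}
```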
