Merge pull request NVIDIA#13 from NVIDIA/cutlass_v1.0.1
Cutlass v1.0.1
kerrmudgeon authored Jun 12, 2018
2 parents 9fd5546 + c566e83 commit e1c4ba5
Showing 354 changed files with 3,329 additions and 129,924 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "tools/external/googletest"]
path = tools/external/googletest
url = https://github.com/google/googletest.git
47 changes: 47 additions & 0 deletions changelog.md
@@ -0,0 +1,47 @@
# NVIDIA CUTLASS Changelog

## 1.0.1 (2018-06-11)

* Intra-threadblock reduction added for small threadblock tile sizes
  * sgemm_64x128x16, sgemm_128x128x16, sgemm_128x64x16, sgemm_128x32x16, sgemm_64x64x16, sgemm_64x32x16
  * igemm_32x32x128
* GEMM _K_ residue handled during prologue prior to mainloop
* Replaced Google Test copy with submodule. Use `git submodule init`

## [1.0.0](https://github.com/NVIDIA/cutlass/commit/2028ebe120aab22bfd0b2baf8902d4c9627eb33f) (2018-05-16)

* Substantial rewrite to accommodate new architecture
* Kernels: SGEMM, DGEMM, IGEMM, HGEMM, WMMA GEMM
* Unit and performance tests

## [0.0.1](https://github.com/NVIDIA/cutlass/commit/d08ba8ac46e2fa3f745e070c390182edb56b2e91) (2017-12-04)

* Initial release


## Copyright

Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.

```
Redistribution and use in source and binary forms, with or without modification, are permitted
provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of
conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of
conditions and the following disclaimer in the documentation and/or other materials
provided with the distribution.
* Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
to endorse or promote products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
```

2 changes: 1 addition & 1 deletion cutlass/cutlass.h
@@ -33,7 +33,7 @@

#define CUTLASS_MAJOR 1
#define CUTLASS_MINOR 0
#define CUTLASS_PATCH 0
#define CUTLASS_PATCH 1
#define CUTLASS_VERSION ((CUTLASS_MAJOR)*100 + (CUTLASS_MINOR)*10 + CUTLASS_PATCH)

#ifdef __NVCC__
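With the patch bump, `CUTLASS_VERSION` evaluates to 101 (1*100 + 0*10 + 1). Below is a minimal sketch of how client code might gate on the macro; the include path and the guarded message are assumptions for illustration, not part of this commit:

```cpp
// Sketch only: compile-time gating on the version macro changed above.
// CUTLASS_VERSION = MAJOR*100 + MINOR*10 + PATCH, so release 1.0.1 yields 101.
#include <cstdio>

#include "cutlass/cutlass.h"

int main() {
#if CUTLASS_VERSION >= 101
  std::printf("CUTLASS %d.%d.%d (1.0.1 or newer)\n",
              CUTLASS_MAJOR, CUTLASS_MINOR, CUTLASS_PATCH);
#else
  std::printf("older CUTLASS release\n");
#endif
  return 0;
}
```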
4 changes: 2 additions & 2 deletions cutlass/fragment.h
@@ -184,7 +184,7 @@ struct FragmentIterator {
/// The shape of the fragment.
typedef typename ShapeMul<Iterations, Shape<1, 1, 1, kElementsPerAccess> >::Shape FragmentShape;
/// The linear strides for iterations.
typedef typename ShapeStrides<FragmentShape>::Shape Strides;
typedef typename ShapeStrides<FragmentShape, kElementsPerAccess>::Shape Strides;

/// Ctor.
template <typename OtherFragment_>
@@ -242,7 +242,7 @@ struct FragmentConstIterator {
/// The shape of the fragment.
typedef typename ShapeMul<Iterations, Shape<1, 1, 1, kElementsPerAccess> >::Shape FragmentShape;
/// The linear strides for iterations.
typedef typename ShapeStrides<FragmentShape>::Shape IterationsStrides;
typedef typename ShapeStrides<FragmentShape, kElementsPerAccess>::Shape IterationsStrides;

/// Ctor.
template <typename OtherFragment_>
68 changes: 43 additions & 25 deletions cutlass/fragment_multiply_add.h
@@ -49,21 +49,29 @@ struct FragmentMultiplyAdd {
CUTLASS_DEVICE FragmentMultiplyAdd() {}

/// Multiply : d = a*b.
template <typename Fragment_>
CUTLASS_DEVICE void multiply(Scalar_ a, Fragment_ const& b, Fragment_& d) {
for (int j = 0; j < Fragment_::kElements; ++j) {
d[j] = a * b[j];
template <typename FragmentB_, typename FragmentCd_>
CUTLASS_DEVICE void multiply(Scalar_ a, FragmentB_ const& b, FragmentCd_& d) {
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
for (int j = 0; j < FragmentCd_::kElements; ++j) {
d[j] = a * b[j * kReduction + 0];
for (int k = 1; k < kReduction; ++k) {
d[j] += a * b[j * kReduction + k];
}
}
}

/// Multiply : d = a*b + c.
template <typename Fragment_>
template <typename FragmentB_, typename FragmentCd_>
CUTLASS_DEVICE void multiply_add(Scalar_ a,
Fragment_ const& b,
Fragment_ const& c,
Fragment_& d) {
for (int j = 0; j < Fragment_::kElements; ++j) {
d[j] = a * b[j] + c[j];
FragmentB_ const& b,
FragmentCd_ const& c,
FragmentCd_& d) {
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
for (int j = 0; j < FragmentCd_::kElements; ++j) {
d[j] = a * b[j * kReduction + 0] + c[j];
for (int k = 1; k < kReduction; ++k) {
d[j] += a * b[j * kReduction + k];
}
}
}
};
@@ -74,7 +82,7 @@
template <>
struct FragmentMultiplyAdd<half> {
/// The shape of the instruction.
typedef Shape<1, 1, 1, 1> InstructionShape;
typedef Shape<1, 1, 2, 1> InstructionShape;
/// The type for A.
typedef half ScalarA;
/// The type for B.
@@ -86,38 +94,48 @@ struct FragmentMultiplyAdd<half> {
CUTLASS_DEVICE FragmentMultiplyAdd() {}

/// Multiply : d = a*b.
template <typename Fragment_>
CUTLASS_DEVICE void multiply(half a, Fragment_ const& b, Fragment_& d) {
template <typename FragmentB_, typename FragmentCd_>
CUTLASS_DEVICE void multiply(half a, FragmentB_ const& b, FragmentCd_& d) {
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 530

// Assemble a half2 from a.
__half2 const a_half2 = __half2half2(a);
// The input.
__half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
// The output.
__half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);

// Assemble a half2 from a.
__half2 const a_half2 = __half2half2(a);

for (int i = 0; i < Fragment_::kElements / 2; ++i) {
d_half2[i] = __hmul2(a_half2, b_half2[i]);
int const kReduction = FragmentB_::kElements / FragmentCd_::kElements;
for (int j = 0; j < FragmentCd_::kElements / 2; ++j) {
d_half2[j] = __hmul2(a_half2, b_half2[j * kReduction + 0]);
for (int k = 1; k < kReduction; ++k) {
d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]);
}
}
#endif
}

/// Multiply : d = a*b + c.
template <typename Fragment_>
CUTLASS_DEVICE void multiply_add(half a, Fragment_ const& b, Fragment_ const& c, Fragment_& d) {
template <typename FragmentB_, typename FragmentCd_>
CUTLASS_DEVICE void multiply_add(half a,
FragmentB_ const& b,
FragmentCd_ const& c,
FragmentCd_& d) {
#if defined(__CUDACC__) && __CUDA_ARCH__ >= 530
// Assemble a half2 from a.
__half2 const a_half2 = __half2half2(a);
// The inputs.
__half2 const* b_half2 = reinterpret_cast<__half2 const*>(&b[0]);
__half2 const* c_half2 = reinterpret_cast<__half2 const*>(&c[0]);
// The output.
__half2* d_half2 = reinterpret_cast<__half2*>(&d[0]);

// Assemble a half2 from a.
__half2 const a_half2 = __half2half2(a);

for (int i = 0; i < Fragment_::kElements / 2; ++i) {
d_half2[i] = __hfma2(a_half2, b_half2[i], c_half2[i]);
int const kReduction = (FragmentB_::kElements / FragmentCd_::kElements);
for (int j = 0; j < FragmentCd_::kElements / 2; ++j) {
d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + 0], c_half2[j]);
for (int k = 1; k < kReduction; ++k) {
d_half2[j] = __hfma2(a_half2, b_half2[j * kReduction + k], d_half2[j]);
}
}
#endif
}
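The substantive change in `FragmentMultiplyAdd` (both the generic template and the `half` specialization) is that `multiply` and `multiply_add` now take separate fragment types for B and for C/D: when B holds `kReduction = FragmentB_::kElements / FragmentCd_::kElements` times more elements than the output, each output element accumulates `a * b` over its `kReduction`-wide slice, in line with the intra-threadblock reduction noted in the changelog. Here is a small standalone sketch of that reduction pattern, with plain arrays standing in for CUTLASS fragments (an assumption for illustration only):

```cpp
// Sketch of the kReduction accumulation pattern; not CUTLASS code.
#include <cstdio>

template <int NB, int NCD>
void multiply_add_sketch(float a, float const (&b)[NB], float const (&c)[NCD], float (&d)[NCD]) {
  static_assert(NB % NCD == 0, "B must hold a whole multiple of the C/D elements");
  int const kReduction = NB / NCD;
  for (int j = 0; j < NCD; ++j) {
    d[j] = a * b[j * kReduction + 0] + c[j];  // seed with the first element of the slice
    for (int k = 1; k < kReduction; ++k) {
      d[j] += a * b[j * kReduction + k];      // reduce the rest of the slice
    }
  }
}

int main() {
  float b[4] = {1.f, 2.f, 3.f, 4.f};
  float c[2] = {10.f, 20.f};
  float d[2];
  multiply_add_sketch(2.f, b, c, d);  // d = {2*(1+2)+10, 2*(3+4)+20} = {16, 34}
  std::printf("%g %g\n", d[0], d[1]);
  return 0;
}
```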
2 changes: 2 additions & 0 deletions cutlass/gemm/clear_accumulators.h
@@ -39,6 +39,8 @@ struct ClearAccumulators {
/// The shared storage.
struct SharedStorage {};

/// Ctor.
CUTLASS_DEVICE ClearAccumulators() {}
/// Ctor.
CUTLASS_DEVICE ClearAccumulators(SharedStorage& shared_storage) {}

139 changes: 82 additions & 57 deletions cutlass/gemm/gemm.h
@@ -40,7 +40,7 @@ namespace gemm {
////////////////////////////////////////////////////////////////////////////////////////////////////

template <typename Gemm_>
__global__ void gemm_kernel(typename Gemm_::Params params) {
__global__ /*__launch_bounds__(Gemm_::kThreads)*/ void gemm_kernel(typename Gemm_::Params params) {
// Declare shared memory.
__shared__ typename Gemm_::SharedStorage shared_storage;

@@ -193,6 +193,71 @@ struct Gemm {
CUTLASS_DEVICE Gemm(Params const& params_, SharedStorage& shared_storage_)
: params(params_), shared_storage(shared_storage_) {}

/// Consume a single iteration of the loop.
template <bool kIsLastIteration>
CUTLASS_DEVICE void consume_tile(typename Traits::GlobalLoadStream& global_stream,
typename Traits::SharedLoadStream& shared_load_stream,
typename Traits::MultiplyAdd::Accumulators& accumulators,
Index outer_k) {
// If that's the last "load iteration" update the predicates.
if (!kIsLastIteration) {
global_stream.move_to_residue<false>(outer_k);
}

// Load data for the next iteration of the main loop.
if (!kIsLastIteration) {
global_stream.copy();
}

// The unrolling steps for the main loop.
int const kUnrollingSteps =
Traits::MultiplyAdd::AccumulatorsPerWarp::kD / Traits::MultiplyAdd::InstructionShape::kD;

CUTLASS_PRAGMA_UNROLL
for (int step = 0; step < kUnrollingSteps - 1; ++step) {
// Trigger the copy from shared memory for the next A/B values.
shared_load_stream.copy(step + 1);
// Make sure the values are available for the current iteration to do the multiply-add.
shared_load_stream.commit(step);

// Do the math on the fragments of the current iteration.
typename Traits::MultiplyAdd multiply_add;
multiply_add.multiply_add(shared_load_stream.fragment_a(step),
shared_load_stream.fragment_b(step),
accumulators,
accumulators);
}

// Make sure the data from shared memory has been entirely consumed.
Traits::shared_load_fence(true);

// Commit the data in shared memory for A/B.
if (!kIsLastIteration) {
global_stream.commit();
}

// Make sure the data is in shared memory.
Traits::shared_store_fence(true);

// Trigger the loads for the next iteration (if needed).
if (!kIsLastIteration) {
// Move to the next stage for the load (if it makes sense).
shared_load_stream.inc_stage();
// Trigger the copy from shared memory for the next loop iteration.
shared_load_stream.copy(0);
}

// Make sure the values are available for the current iteration to do the multiply-add.
shared_load_stream.commit(kUnrollingSteps - 1);

// Do the math on the fragments of the current iteration.
typename Traits::MultiplyAdd multiply_add;
multiply_add.multiply_add(shared_load_stream.fragment_a(kUnrollingSteps - 1),
shared_load_stream.fragment_b(kUnrollingSteps - 1),
accumulators,
accumulators);
}

/// Do the GEMM.
CUTLASS_DEVICE void multiply_add() {
// Swizzle the IDs of the block (to enable better cache behavior).
@@ -212,16 +277,11 @@ struct Gemm {
// Create the accumulator clear.
ClearAccumulators clear(shared_storage.main_loop.clear);

/// Define the mainloop iteration size
typedef typename Traits::MultiplyAdd MultiplyAdd;

// By how much we unroll the main loop.
Index const kUnroll = static_cast<Index>(MultiplyAdd::AccumulatorsPerWarp::kD);
Index const kUnroll = static_cast<Index>(Traits::OutputTile::kD);

// If we do not have enough steps in the main loop, trigger the residue code.
if (params.k < kUnroll) {
global_stream.residue(params.k, true);
}
global_stream.move_to_residue<true>(params.k);

// Fetch the fragments for A and B from global memory.
global_stream.copy();
@@ -232,9 +292,12 @@ struct Gemm {
// Make sure the data is in shared memory.
Traits::shared_store_fence(false);

// Rollback to the beginning of the GEMM-K dimension. It may have no impact.
global_stream.rollback();

// The unrolling steps for the main loop.
int const kUnrollingSteps =
MultiplyAdd::AccumulatorsPerWarp::kD / MultiplyAdd::InstructionShape::kD;
Traits::MultiplyAdd::AccumulatorsPerWarp::kD / Traits::MultiplyAdd::InstructionShape::kD;

// Make sure we have at least 2 unrolling steps or our pipelining is not going to work.
static_assert(kUnrollingSteps >= 2, "The pipelining assumes at least two steps");
@@ -246,59 +309,21 @@ struct Gemm {
shared_load_stream.copy(0);

// Allocate the accumulators.
typename MultiplyAdd::Accumulators accumulators;
typename Traits::MultiplyAdd::Accumulators accumulators;
// Clear the accumulators.
clear.clear(accumulators);

// Enter the main loop and iterate.
typedef typename Traits::Index Index;
for (Index outer_k = params.k - kUnroll; outer_k > -kUnroll; outer_k -= kUnroll) {
// If that's the last "load iteration" update the predicates.
int const is_residue = outer_k <= kUnroll;
if (is_residue) {
global_stream.residue(outer_k);
}

// Load data for the next iteration of the main loop.
global_stream.copy();

CUTLASS_PRAGMA_UNROLL
for (int step = 0; step < kUnrollingSteps - 1; ++step) {
// Trigger the copy from shared memory for the next A/B values.
shared_load_stream.copy(step + 1);
// Make sure the values are available for the current iteration to do the multiply-add.
shared_load_stream.commit(step);

// Do the math on the fragments of the current iteration.
MultiplyAdd multiply_add;
multiply_add.multiply_add(shared_load_stream.fragment_a(step),
shared_load_stream.fragment_b(step),
accumulators,
accumulators);
}

// Make sure the data from shared memory has been entirely consumed.
Traits::shared_load_fence(true);

// Commit the data in shared memory for A/B.
global_stream.commit();

// Make sure the data is in shared memory.
Traits::shared_store_fence(true);
// The loop index.
Index outer_k = params.k - kUnroll;

// Move to the next stage for the load (if it makes sense).
shared_load_stream.inc_stage();
// Trigger the copy from shared memory for the next loop iteration.
shared_load_stream.copy(0);
// Make sure the values are available for the current iteration to do the multiply-add.
shared_load_stream.commit(kUnrollingSteps - 1);
// Enter the main loop and iterate.
for (; outer_k > 0; outer_k -= kUnroll) {
consume_tile<false>(global_stream, shared_load_stream, accumulators, outer_k);
}

// Do the math on the fragments of the current iteration.
MultiplyAdd multiply_add;
multiply_add.multiply_add(shared_load_stream.fragment_a(kUnrollingSteps - 1),
shared_load_stream.fragment_b(kUnrollingSteps - 1),
accumulators,
accumulators);
// Residual loop.
for (; outer_k > -kUnroll; outer_k -= kUnroll) {
consume_tile<true>(global_stream, shared_load_stream, accumulators, outer_k);
}

// Epilogue.
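The main loop in `Gemm::multiply_add` is refactored around the new `consume_tile<kIsLastIteration>` helper: a steady-state loop runs `consume_tile<false>` while further GEMM-K tiles remain (issuing the next tile's global loads and shared-memory commits), and a residual loop runs `consume_tile<true>`, which compiles those steps out; the K residue itself is handled in the prologue via `move_to_residue`/`rollback`. A host-side sketch of just this control flow follows; the tile depth, problem size, and printouts are assumptions for illustration, not CUTLASS code:

```cpp
// Control-flow sketch of the split main/residual loop; not CUTLASS code.
#include <cstdio>

// Mirrors kIsLastIteration above: when true, work needed only by a following
// tile (global prefetch, shared-memory commit) is compiled away.
template <bool kIsLastIteration>
void consume_tile_sketch(int outer_k) {
  if (!kIsLastIteration) {
    std::printf("  prefetch next tile (outer_k = %d)\n", outer_k);
  }
  std::printf("  multiply-add on current tile\n");
}

int main() {
  int const k = 96;        // assumed GEMM-K extent
  int const kUnroll = 32;  // assumed Traits::OutputTile::kD (tile depth)

  int outer_k = k - kUnroll;
  // Main loop: at least one more full tile follows this one.
  for (; outer_k > 0; outer_k -= kUnroll) {
    consume_tile_sketch<false>(outer_k);
  }
  // Residual loop: the final tile(s); no next-tile loads are issued.
  for (; outer_k > -kUnroll; outer_k -= kUnroll) {
    consume_tile_sketch<true>(outer_k);
  }
  return 0;
}
```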
