[examples] Fix typos in SYRK and TRMM examples (NVIDIA#507)

zhaoyongke · Jun 4, 2022 · 0abaac8 · 0abaac8
1 parent 858c735
commit 0abaac8
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 7 deletions.
diff --git a/examples/31_basic_syrk/basic_syrk.cu b/examples/31_basic_syrk/basic_syrk.cu
@@ -37,7 +37,7 @@
   the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes
   all matrices have column-major layout.
 
-  The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
+  The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices.
   See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
   in CUTLASS.
 
@@ -83,7 +83,7 @@ cudaError_t CutlassSsyrkNN(
   int ldc) {
 
   // Define type definition for double-precision CUTLASS SYRK with column-major
-  // input matrices and 128x128x8 threadblock tile size (chosen by default).
+  // input matrices and 16x32x16 threadblock tile size (chosen by default).
   //
   // To keep the interface manageable, several helpers are defined for plausible compositions
   // including the following example for double-precision SYRK. Typical values are used as
@@ -138,7 +138,7 @@ cudaError_t CutlassSsyrkNN(
                               {alpha, beta}, // Scalars used in the Epilogue
                               reinterpret_cast<void const *>(A),
                               const_cast<void *>(reinterpret_cast<void *>(C)),
-                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so    urce C matrix)
+                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
                               (int64_t)N*K, // Batch strides
                               (int64_t)N*N,
                               (int64_t)N*N,

diff --git a/examples/32_basic_trmm/basic_trmm.cu b/examples/32_basic_trmm/basic_trmm.cu
@@ -37,7 +37,7 @@
   the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes
   all matrices have column-major layout.
 
-  The threadblock tile size is chosen as 128x128x8 which offers good performance for large matrices.
+  The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices.
   See the CUTLASS Parallel for All blog post for more exposition on the tunable parameters available
   in CUTLASS.
 
@@ -84,7 +84,7 @@ cudaError_t CutlassStrmmNN(
   int ldc) {
 
   // Define type definition for double-precision CUTLASS TRMM with column-major
-  // input matrices and 128x128x8 threadblock tile size (chosen by default).
+  // input matrices and 64x64x16 threadblock tile size (chosen by default).
   //
   // To keep the interface manageable, several helpers are defined for plausible compositions
   // including the following example for double-precision TRMM. Typical values are used as
@@ -107,7 +107,7 @@ cudaError_t CutlassStrmmNN(
     double,
     cutlass::arch::OpClassTensorOp,
     cutlass::arch::Sm80,
-    cutlass::gemm::GemmShape<64,64, 16>,
+    cutlass::gemm::GemmShape<64, 64, 16>,
     cutlass::gemm::GemmShape<32, 32, 16>,
     cutlass::gemm::GemmShape<8, 8, 4>,
     cutlass::epilogue::thread::LinearCombination<
@@ -143,7 +143,7 @@ cudaError_t CutlassStrmmNN(
                               {alpha}, // Scalars used in the Epilogue
                               reinterpret_cast<void const *>(A),
                               reinterpret_cast<void const *>(B),
-                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than so    urce C matrix)
+                              reinterpret_cast<void *>(C), // destination matrix D (may be different memory than source C matrix)
                               (int64_t)M*M, // Batch strides
                               (int64_t)M*N,
                               (int64_t)M*N,