Fix several typos (NVIDIA#1169)
Co-authored-by: isaacw <isaacw@nvidia.com>
wang-y-z and isaacw authored Nov 3, 2023
1 parent c008b4a commit 557be3a
Showing 21 changed files with 30 additions and 30 deletions.
2 changes: 1 addition & 1 deletion docs/namespacecutlass_1_1reference_1_1host.html
@@ -1677,7 +1677,7 @@ <h2 class="groupheader">Function Documentation</h2>
</tr>
</table>
</div><div class="memdoc">
<p>Returns a pair containing a boolean of whether a value exists in a tensor and the location of of the first occurrence. If the value is not contained in the tensor, the second element of the pair is undefined. </p>
<p>Returns a pair containing a boolean of whether a value exists in a tensor and the location of the first occurrence. If the value is not contained in the tensor, the second element of the pair is undefined. </p>

</div>
</div>
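As a hedged illustration of the contract documented above — a boolean plus the location of the first occurrence, with the location unspecified on a miss — here is a minimal standalone sketch. The helper name `find_first` and the flat-array signature are hypothetical; the actual CUTLASS reference routine operates on tensors.

```cpp
#include <cstddef>
#include <utility>

// Hypothetical stand-in for the documented reference routine: returns
// {found, index of first occurrence}. The index is unspecified when
// found == false, mirroring the "undefined second element" contract above.
std::pair<bool, std::size_t> find_first(float const* data, std::size_t n, float value) {
  for (std::size_t i = 0; i < n; ++i) {
    if (data[i] == value) {
      return {true, i};
    }
  }
  return {false, 0};  // second element carries no meaning here
}

int main() {
  float tensor[] = {3.f, 1.f, 4.f, 1.f, 5.f};
  auto [found, idx] = find_first(tensor, 5, 1.f);
  // found == true, idx == 1: the first occurrence, not the later one at index 3.
  return (found && idx == 1) ? 0 : 1;
}
```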
8 changes: 4 additions & 4 deletions examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py
@@ -49,7 +49,7 @@ def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, cutlass_dep
self.arg_member = []
self.gen_class_name = gen_class_name
self.gen_kernel_name = gen_class_name + "Kernel"
self.tempalte_args = []
self.template_args = []
self.__tempalate_arg_list = {'Stages': int, 'SplitKSerial': bool, 'IsBetaZero': bool, 'AlignmentA': int, 'AlignmentB': int}

self.file_name = output_dir + "/device/" +gen_class_name +".h"
@@ -63,7 +63,7 @@ def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, cutlass_dep
self.first_use_1stage = False

## gen kernel
self.gen_kernel = gen_ker.gen_kernel(self.tempalte_args, self.gen_class_name, self.b2b_num, output_dir, cutlass_deps_root, project_root)
self.gen_kernel = gen_ker.gen_kernel(self.template_args, self.gen_class_name, self.b2b_num, output_dir, cutlass_deps_root, project_root)


def __check_arg_type(self, temp_arg):
@@ -126,7 +126,7 @@ def gen_code(self, sm_cap, mma_tp, ifprint = True):
func_code = self.gen_all_func()
member_var_code = "private:\n typename B2bGemmKernel::Params params_;\n"

gen_code = gen_ir.gen_template_class(self.gen_class_name, self.tempalte_args, func_code + member_var_code)
gen_code = gen_ir.gen_template_class(self.gen_class_name, self.template_args, func_code + member_var_code)
code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("device", gen_code)))

if ifprint:
@@ -142,7 +142,7 @@ def gen_code(self, sm_cap, mma_tp, ifprint = True):

def update_b2b_class_template_args(self):
for arg in self.args.keys():
self.tempalte_args.append([self.__check_arg_type(arg), arg, self.args[arg]])
self.template_args.append([self.__check_arg_type(arg), arg, self.args[arg]])

def update_b2b_args(self):

2 changes: 1 addition & 1 deletion examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py
@@ -444,7 +444,7 @@ def __init__(self, template_param, gen_class_name, b2b_num, output_dir, cutlass_

self.gen_class_name = "B2bGemm"
self.gen_kernel_name = gen_class_name + "Kernel"
self.tempalte_args = []
self.template_args = []

self.cutlass_deps_root = cutlass_deps_root
self.project_root = project_root
10 changes: 5 additions & 5 deletions examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py
@@ -957,21 +957,21 @@ def gen_public_member(self):

def gen_code(self):

tempalte_arg = []
template_arg = []
for i in range(self.b2b_num):
tempalte_arg.append(("typename", helper.var_idx("Shape", i)))
template_arg.append(("typename", helper.var_idx("Shape", i)))
for i in range(self.b2b_num):
tempalte_arg.append(("typename", helper.var_idx("Policy", i)))
template_arg.append(("typename", helper.var_idx("Policy", i)))
for i in range(self.b2b_num):
tempalte_arg.append((int, helper.var_idx("Stage", i)))
template_arg.append((int, helper.var_idx("Stage", i)))



code_body = self.gen_using_and_misc(self.b2b_num)
code_body += self.gen_protected()
code_body += self.gen_public_member()

class_code = gen_ir.gen_template_class("B2bMmaBase", tempalte_arg, code_body)
class_code = gen_ir.gen_template_class("B2bMmaBase", template_arg, code_body)

code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("threadblock", class_code)))

2 changes: 1 addition & 1 deletion examples/python/04_epilogue_visitor.ipynb
@@ -68,7 +68,7 @@
"source": [
"## Define the epilogue visitor functor\n",
"The epilogue functor can be defined as a simple Python function and a set of example tensors for inputs and outputs. The example below illustrates a complex epilogue under the directed acyclic graph structure (`F` is used twice). The epilogue takes source tensors in different ranks: `alpha`, `beta` are scalars, `bias` is a column vector to broadcast, and `C`, `aux` are matrices. It contains various math operations from basic arithmatic operations and built-in callable functions like `relu`. It also accomodates multiple outputs `D` and `F`. Note that there are some restrictions on syntax.\n",
"* Each named variable must be assigned exactly once and defined before it it used.\n",
"* Each named variable must be assigned exactly once and defined before it used.\n",
"* Reserved names: `accum`, `C`, and `D` are reserved for accumulator, tensor_C, and tensor_D.\n",
"* Return values must be a named variable.\n",
"\n",
2 changes: 1 addition & 1 deletion include/cute/algorithm/tensor_algorithms.hpp
@@ -123,7 +123,7 @@ transform(Tensor<EngineIn,LayoutIn>&& tensor_in, Tensor<EngineOut,LayoutOut>&& t

// Similar to std::transform with a binary operation
// Takes two tensors as input and one tensor as output.
// Applies the binary_op to tensor_in1 and and tensor_in2 and
// Applies the binary_op to tensor_in1 and tensor_in2 and
// assigns it to tensor_out
template <class EngineIn1, class LayoutIn1,
class EngineIn2, class LayoutIn2,
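A hedged host-side sketch of the binary `transform` described in the comment above (two inputs, one output, a binary op). The include path and the use of raw host pointers are assumptions for illustration; the call shape follows the documented overload.

```cpp
#include <cute/tensor.hpp>

int main() {
  float a[4] = {1.f, 2.f, 3.f, 4.f};
  float b[4] = {10.f, 20.f, 30.f, 40.f};
  float c[4] = {};

  auto tA = cute::make_tensor(a, cute::make_layout(cute::Int<4>{}));
  auto tB = cute::make_tensor(b, cute::make_layout(cute::Int<4>{}));
  auto tC = cute::make_tensor(c, cute::make_layout(cute::Int<4>{}));

  // tensor_out(i) = binary_op(tensor_in1(i), tensor_in2(i))
  cute::transform(tA, tB, tC, [](float x, float y) { return x + y; });
  // c now holds {11, 22, 33, 44}
  return 0;
}
```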
2 changes: 1 addition & 1 deletion include/cute/layout.hpp
@@ -576,7 +576,7 @@ depth(Layout<Shape,Stride> const& layout)

// Return the codomain shape of a mode
// @post size(coshape(@a a)) == cosize(@a a)
// @return C Coordinate with smallest elements such that that
// @return C Coordinate with smallest elements such that
// @a elem_less(sub_layout(c), C) for all c < size(@a sub_layout)
// where sub_layout = get<Is...>(layout).
template <int... Is, class Shape, class Stride>
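A hedged compile-time check of the postcondition stated above, using a small fully static layout (the include path is an assumption):

```cpp
#include <cute/layout.hpp>
using namespace cute;

int main() {
  // Layout (2,3):(1,4): 6 coordinates, largest offset 1*1 + 2*4 = 9,
  // so the codomain spans 10 elements.
  constexpr auto layout = make_layout(make_shape(Int<2>{}, Int<3>{}),
                                      make_stride(Int<1>{}, Int<4>{}));
  static_assert(size(layout) == 6);
  static_assert(cosize(layout) == 10);
  // The documented @post: the coshape covers exactly the codomain.
  static_assert(size(coshape(layout)) == cosize(layout));
  return 0;
}
```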
@@ -527,7 +527,7 @@ class GemmUniversal<
auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
// Allocate the the accumulators for the (M,N) blk_shape
// Allocate the accumulators for the (M,N) blk_shape
//
// MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)
@@ -540,7 +540,7 @@ class GemmUniversal<
auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
// Allocate the the accumulators for the (M,N) blk_shape
// Allocate the accumulators for the (M,N) blk_shape
Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)
// Order two Math WG's MMA one after the other, helps hide Epilogue
@@ -347,7 +347,7 @@ class PersistentTileSchedulerSm90StreamK {
// The number of tiles for which reduction is required is either:
// (a) the total number of output tiles (in the case of split-K)
// (b) the number of stream-K tiles
// To calculate the the total number of output tiles in the split-K case, we
// To calculate the total number of output tiles in the split-K case, we
// note that, in the split-K case, the units_per_problem_ member of Params will be
// the total number of output tiles.
auto reduction_tiles = params.splits_ > 1 ? params.units_per_problem_ : params.sk_tiles_;
@@ -556,7 +556,7 @@ class AsyncTranspositionOperandB_1BElementB {
constexpr auto WarpThreadLayout = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// A warp group uses 8 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
/// Divide a warp_group_tile into 8x8 warp_tiles to futher reduce the reg usage.
/// Divide a warp_group_tile into 8x8 warp_tiles to further reduce the reg usage.
/// Step 0: Step 1: Step 2: Step 3:
/// W0 W1 W2 W3 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
/// W1 W0 -- -- -- -- -- -- -- -- W3 W2 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
2 changes: 1 addition & 1 deletion include/cutlass/wmma_array.h
@@ -47,7 +47,7 @@ namespace cutlass {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Wmma array type (WmmaFragmentArray holds elements of of type nvcuda::wmma::fragment)
/// Wmma array type (WmmaFragmentArray holds elements of type nvcuda::wmma::fragment)
template <
/// Element type
typename T,
2 changes: 1 addition & 1 deletion media/docs/cute/04_algorithms.md
@@ -116,7 +116,7 @@ would include the following.
access instructions (like `cp.async`), then dispatch to the
custom instruction.
2. The the two `Tensor`s have static layouts and it can be proven
2. The two `Tensor`s have static layouts and it can be proven
that element vectorization is valid -- for example, four `LDS.32`s
can be combined into a single `LDS.128` -- then vectorize the source
and destinations tensors.
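For context, a hedged host-side sketch of the `copy` call that sits in front of this dispatch logic; the include path and host usage are assumptions. On device, the same call is what may lower to `cp.async` or to vectorized loads/stores when the conditions above are met.

```cpp
#include <cute/tensor.hpp>
using namespace cute;

int main() {
  float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float dst[8] = {};

  // Fully static, contiguous layouts: the case in which the dispatch above
  // is allowed to coalesce several element accesses into one wider access.
  auto tS = make_tensor(src, make_layout(Int<8>{}));
  auto tD = make_tensor(dst, make_layout(Int<8>{}));

  copy(tS, tD);  // dst now mirrors src
  return 0;
}
```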
2 changes: 1 addition & 1 deletion media/docs/cute/0t_mma_atom.md
@@ -37,7 +37,7 @@ and the `Layout`s of threads and values within the operation.
The `MMA_Traits` struct takes the Operation as a template parameter.
CuTe specializes `MMA_Traits` for each Operation type that it supports.

Together, these two types comprise an "Atom" that decouples the complexity of thread and data layouts from the call site of of the PTX instruction. The Atom's Traits struct exposes information that is relevant to a single MMA operation, no matter the granularity at which it operates.
Together, these two types comprise an "Atom" that decouples the complexity of thread and data layouts from the call site of the PTX instruction. The Atom's Traits struct exposes information that is relevant to a single MMA operation, no matter the granularity at which it operates.

CuTe MMA atoms expose the semantics of a single MMA operation.
This is true regardless of the hardware level at which the MMA operates.
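A hedged sketch of how an Operation and its `MMA_Traits` pair up as an atom. The specific SM80 operation name and include set are assumptions; any Operation type that CuTe specializes `MMA_Traits` for follows the same pattern.

```cpp
#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>
using namespace cute;

int main() {
  // One PTX-level MMA instruction: D(f32) += A(f16) * B(f16) on a 16x8x16 tile.
  using Op   = SM80_16x8x16_F32F16F16F32_TN;
  // The atom couples Op with MMA_Traits<Op>, which carries the logical MNK
  // shape and the thread/value layouts of that single instruction.
  using Atom = MMA_Atom<Op>;

  // Tiling the atom across more threads and data is a separate, later choice.
  auto tiled_mma = make_tiled_mma(Atom{});
  print(tiled_mma);
  return 0;
}
```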
2 changes: 1 addition & 1 deletion media/docs/cute/0x_gemm_tutorial.md
@@ -255,7 +255,7 @@ int bar()
}
```
"Static" is an unfortunately overloaded term in C++. Sometimes it means "the opposite of instance," like a "static function" or "static member" of a class. (Some programming languages, like Java, say "class method" to refer to a "static function of a class.") That's not what we mean here. Instead, we mean "part of a compile-time type." For example, `Int<1>` encodes the value 1 at compile time, as part of the type of a templated class `Int<Value>`. `Int<3>` and `Int<4>` have different types. You can get the value of of the type like this: `Int<3>::value`. (The `value` is a `static constexpr` member of the class, where "static" means "opposite of instance.") As soon as you go from `Int<3>` to `Int<3>::value`, you've gone from (3) above (a compile-time value) to (2) above (a `constexpr` value). In some situations, this may mean that the compiler treats it as a run-time value.
"Static" is an unfortunately overloaded term in C++. Sometimes it means "the opposite of instance," like a "static function" or "static member" of a class. (Some programming languages, like Java, say "class method" to refer to a "static function of a class.") That's not what we mean here. Instead, we mean "part of a compile-time type." For example, `Int<1>` encodes the value 1 at compile time, as part of the type of a templated class `Int<Value>`. `Int<3>` and `Int<4>` have different types. You can get the value of the type like this: `Int<3>::value`. (The `value` is a `static constexpr` member of the class, where "static" means "opposite of instance.") As soon as you go from `Int<3>` to `Int<3>::value`, you've gone from (3) above (a compile-time value) to (2) above (a `constexpr` value). In some situations, this may mean that the compiler treats it as a run-time value.
#### Strides
2 changes: 1 addition & 1 deletion media/docs/quickstart.md
@@ -56,7 +56,7 @@ You may explicitly exclude cuBLAS and cuDNN as dependencies with the following C

## Build and run the CUTLASS Profiler

From the `build/` directory created above, compile the the CUTLASS Profiler.
From the `build/` directory created above, compile the CUTLASS Profiler.
```bash
$ make cutlass_profiler -j12
```
6 changes: 3 additions & 3 deletions test/unit/conv/device/conv2d_testbed.h
@@ -696,7 +696,7 @@ bool TestAllConv2d(
return false;
}

// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the the number of tested problem counts
// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts
if (CutlassUnitTestProblemCount() &&
testbed.tested_problem_count > CutlassUnitTestProblemCount()) {
return true;
@@ -742,7 +742,7 @@ bool TestAllConv2d(
}
// Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
{1, 17, 11, 288}, // input size (NHWC)
@@ -784,7 +784,7 @@ bool TestAllConv2d(
return false;
}

// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the the number of tested problem counts
// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts
if (CutlassUnitTestProblemCount() &&
testbed.tested_problem_count > CutlassUnitTestProblemCount()) {
return true;
2 changes: 1 addition & 1 deletion test/unit/conv/device/conv2d_testbed_interleaved.h
@@ -609,7 +609,7 @@ bool TestAllInterleavedConv2d(
#if 0
// Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
{1, 17, 11, 288}, // input size (NHWC)
2 changes: 1 addition & 1 deletion test/unit/conv/device/conv2d_with_broadcast_testbed.h
@@ -632,7 +632,7 @@ bool TestAllConv2dWithBroadcast(

// Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
{1, 17, 11, 288}, // input size (NHWC)
2 changes: 1 addition & 1 deletion test/unit/conv/device/conv2d_with_reduction_testbed.h
@@ -587,7 +587,7 @@ bool TestAllConv2dWithReduction(

// Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
{1, 17, 11, 288}, // input size (NHWC)
2 changes: 1 addition & 1 deletion test/unit/conv/device/conv3d_testbed.h
@@ -613,7 +613,7 @@ bool TestAllConv3d(

// Sweep split-k-slice using serial reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv3dProblemSize conv3d_split_k_test_size (
{1, 8, 8, 8, 32}, // input size (NDHWC)