Fix several typos (NVIDIA#1169)
Co-authored-by: isaacw <isaacw@nvidia.com>
wang-y-z and isaacw authored Nov 3, 2023
1 parent c008b4a commit 557be3a
Showing 21 changed files with 30 additions and 30 deletions.
2 changes: 1 addition & 1 deletion docs/namespacecutlass_1_1reference_1_1host.html
@@ -1677,7 +1677,7 @@ <h2 class="groupheader">Function Documentation</h2>
</tr>
</table>
</div><div class="memdoc">
<p>Returns a pair containing a boolean of whether a value exists in a tensor and the location of of the first occurrence. If the value is not contained in the tensor, the second element of the pair is undefined. </p>
<p>Returns a pair containing a boolean of whether a value exists in a tensor and the location of the first occurrence. If the value is not contained in the tensor, the second element of the pair is undefined. </p>

</div>
</div>
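As a hedged illustration of the contract documented above — a boolean plus the location of the first occurrence, with the location unspecified on a miss — here is a minimal standalone sketch. The helper name `find_first` and the flat-array signature are hypothetical; the actual CUTLASS reference routine operates on tensors.

```cpp
#include <cstddef>
#include <utility>

// Hypothetical stand-in for the documented reference routine: returns
// {found, index of first occurrence}. The index is unspecified when
// found == false, mirroring the "undefined second element" contract above.
std::pair<bool, std::size_t> find_first(float const* data, std::size_t n, float value) {
  for (std::size_t i = 0; i < n; ++i) {
    if (data[i] == value) {
      return {true, i};
    }
  }
  return {false, 0};  // second element carries no meaning here
}

int main() {
  float tensor[] = {3.f, 1.f, 4.f, 1.f, 5.f};
  auto [found, idx] = find_first(tensor, 5, 1.f);
  // found == true, idx == 1: the first occurrence, not the later one at index 3.
  return (found && idx == 1) ? 0 : 1;
}
```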
8 changes: 4 additions & 4 deletions examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py
@@ -49,7 +49,7 @@ def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, cutlass_dep
self.arg_member = []
self.gen_class_name = gen_class_name
self.gen_kernel_name = gen_class_name + "Kernel"
self.tempalte_args = []
self.template_args = []
self.__tempalate_arg_list = {'Stages': int, 'SplitKSerial': bool, 'IsBetaZero': bool, 'AlignmentA': int, 'AlignmentB': int}

self.file_name = output_dir + "/device/" +gen_class_name +".h"
@@ -63,7 +63,7 @@ def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, cutlass_dep
self.first_use_1stage = False

## gen kernel
self.gen_kernel = gen_ker.gen_kernel(self.tempalte_args, self.gen_class_name, self.b2b_num, output_dir, cutlass_deps_root, project_root)
self.gen_kernel = gen_ker.gen_kernel(self.template_args, self.gen_class_name, self.b2b_num, output_dir, cutlass_deps_root, project_root)


def __check_arg_type(self, temp_arg):
@@ -126,7 +126,7 @@ def gen_code(self, sm_cap, mma_tp, ifprint = True):
func_code = self.gen_all_func()
member_var_code = "private:\n typename B2bGemmKernel::Params params_;\n"

gen_code = gen_ir.gen_template_class(self.gen_class_name, self.tempalte_args, func_code + member_var_code)
gen_code = gen_ir.gen_template_class(self.gen_class_name, self.template_args, func_code + member_var_code)
code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("device", gen_code)))

if ifprint:
@@ -142,7 +142,7 @@ def gen_code(self, sm_cap, mma_tp, ifprint = True):

def update_b2b_class_template_args(self):
for arg in self.args.keys():
self.tempalte_args.append([self.__check_arg_type(arg), arg, self.args[arg]])
self.template_args.append([self.__check_arg_type(arg), arg, self.args[arg]])

def update_b2b_args(self):

2 changes: 1 addition & 1 deletion examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py
@@ -444,7 +444,7 @@ def __init__(self, template_param, gen_class_name, b2b_num, output_dir, cutlass_

self.gen_class_name = "B2bGemm"
self.gen_kernel_name = gen_class_name + "Kernel"
self.tempalte_args = []
self.template_args = []

self.cutlass_deps_root = cutlass_deps_root
self.project_root = project_root
10 changes: 5 additions & 5 deletions examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py
@@ -957,21 +957,21 @@ def gen_public_member(self):

def gen_code(self):

tempalte_arg = []
template_arg = []
for i in range(self.b2b_num):
tempalte_arg.append(("typename", helper.var_idx("Shape", i)))
template_arg.append(("typename", helper.var_idx("Shape", i)))
for i in range(self.b2b_num):
tempalte_arg.append(("typename", helper.var_idx("Policy", i)))
template_arg.append(("typename", helper.var_idx("Policy", i)))
for i in range(self.b2b_num):
tempalte_arg.append((int, helper.var_idx("Stage", i)))
template_arg.append((int, helper.var_idx("Stage", i)))



code_body = self.gen_using_and_misc(self.b2b_num)
code_body += self.gen_protected()
code_body += self.gen_public_member()

class_code = gen_ir.gen_template_class("B2bMmaBase", tempalte_arg, code_body)
class_code = gen_ir.gen_template_class("B2bMmaBase", template_arg, code_body)

code = self.gen_include_header() + gen_ir.gen_namespace("cutlass", gen_ir.gen_namespace("gemm", gen_ir.gen_namespace("threadblock", class_code)))

2 changes: 1 addition & 1 deletion examples/python/04_epilogue_visitor.ipynb
@@ -68,7 +68,7 @@
"source": [
"## Define the epilogue visitor functor\n",
"The epilogue functor can be defined as a simple Python function and a set of example tensors for inputs and outputs. The example below illustrates a complex epilogue under the directed acyclic graph structure (`F` is used twice). The epilogue takes source tensors in different ranks: `alpha`, `beta` are scalars, `bias` is a column vector to broadcast, and `C`, `aux` are matrices. It contains various math operations from basic arithmatic operations and built-in callable functions like `relu`. It also accomodates multiple outputs `D` and `F`. Note that there are some restrictions on syntax.\n",
"* Each named variable must be assigned exactly once and defined before it it used.\n",
"* Each named variable must be assigned exactly once and defined before it used.\n",
"* Reserved names: `accum`, `C`, and `D` are reserved for accumulator, tensor_C, and tensor_D.\n",
"* Return values must be a named variable.\n",
"\n",
2 changes: 1 addition & 1 deletion include/cute/algorithm/tensor_algorithms.hpp
@@ -123,7 +123,7 @@ transform(Tensor<EngineIn,LayoutIn>&& tensor_in, Tensor<EngineOut,LayoutOut>&& t

// Similar to std::transform with a binary operation
// Takes two tensors as input and one tensor as output.
// Applies the binary_op to tensor_in1 and and tensor_in2 and
// Applies the binary_op to tensor_in1 and tensor_in2 and
// assigns it to tensor_out
template <class EngineIn1, class LayoutIn1,
class EngineIn2, class LayoutIn2,
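A hedged host-side sketch of the binary `transform` described in the comment above (two inputs, one output, a binary op). The include path and the use of raw host pointers are assumptions for illustration; the call shape follows the documented overload.

```cpp
#include <cute/tensor.hpp>

int main() {
  float a[4] = {1.f, 2.f, 3.f, 4.f};
  float b[4] = {10.f, 20.f, 30.f, 40.f};
  float c[4] = {};

  auto tA = cute::make_tensor(a, cute::make_layout(cute::Int<4>{}));
  auto tB = cute::make_tensor(b, cute::make_layout(cute::Int<4>{}));
  auto tC = cute::make_tensor(c, cute::make_layout(cute::Int<4>{}));

  // tensor_out(i) = binary_op(tensor_in1(i), tensor_in2(i))
  cute::transform(tA, tB, tC, [](float x, float y) { return x + y; });
  // c now holds {11, 22, 33, 44}
  return 0;
}
```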
2 changes: 1 addition & 1 deletion include/cute/layout.hpp
@@ -576,7 +576,7 @@ depth(Layout<Shape,Stride> const& layout)

// Return the codomain shape of a mode
// @post size(coshape(@a a)) == cosize(@a a)
// @return C Coordinate with smallest elements such that that
// @return C Coordinate with smallest elements such that
// @a elem_less(sub_layout(c), C) for all c < size(@a sub_layout)
// where sub_layout = get<Is...>(layout).
template <int... Is, class Shape, class Stride>
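A hedged compile-time check of the postcondition stated above, using a small fully static layout (the include path is an assumption):

```cpp
#include <cute/layout.hpp>
using namespace cute;

int main() {
  // Layout (2,3):(1,4): 6 coordinates, largest offset 1*1 + 2*4 = 9,
  // so the codomain spans 10 elements.
  constexpr auto layout = make_layout(make_shape(Int<2>{}, Int<3>{}),
                                      make_stride(Int<1>{}, Int<4>{}));
  static_assert(size(layout) == 6);
  static_assert(cosize(layout) == 10);
  // The documented @post: the coshape covers exactly the codomain.
  static_assert(size(coshape(layout)) == cosize(layout));
  return 0;
}
```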
@@ -527,7 +527,7 @@ class GemmUniversal<
auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
auto work_k_tile_count = TileScheduler::get_work_k_tile_count(work_tile_info, problem_shape_MNKL, blk_shape);
// Allocate the the accumulators for the (M,N) blk_shape
// Allocate the accumulators for the (M,N) blk_shape
//
// MSVC CTAD breaks if we say "Tensor" here, so we use "auto" instead.
auto accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)
@@ -540,7 +540,7 @@ class GemmUniversal<
auto l_coord = idx2crd(work_tile_info.L_idx, shape<4>(gB_nkl));
auto blk_coord = make_coord(m_coord, n_coord, _, l_coord);
// Allocate the the accumulators for the (M,N) blk_shape
// Allocate the accumulators for the (M,N) blk_shape
Tensor accumulators = partition_fragment_C(tiled_mma, take<0,2>(blk_shape)); // (MMA,MMA_M,MMA_N)
// Order two Math WG's MMA one after the other, helps hide Epilogue
@@ -347,7 +347,7 @@ class PersistentTileSchedulerSm90StreamK {
// The number of tiles for which reduction is required is either:
// (a) the total number of output tiles (in the case of split-K)
// (b) the number of stream-K tiles
// To calculate the the total number of output tiles in the split-K case, we
// To calculate the total number of output tiles in the split-K case, we
// note that, in the split-K case, the units_per_problem_ member of Params will be
// the total number of output tiles.
auto reduction_tiles = params.splits_ > 1 ? params.units_per_problem_ : params.sk_tiles_;
@@ -556,7 +556,7 @@ class AsyncTranspositionOperandB_1BElementB {
constexpr auto WarpThreadLayout = make_layout(make_shape(Int<WarpThreadShapeN>{}, Int<WarpThreadShapeK>{}));
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// A warp group uses 8 steps to transpose the whole WarpgroupTileSize x WarpgroupTileSize.
/// Divide a warp_group_tile into 8x8 warp_tiles to futher reduce the reg usage.
/// Divide a warp_group_tile into 8x8 warp_tiles to further reduce the reg usage.
/// Step 0: Step 1: Step 2: Step 3:
/// W0 W1 W2 W3 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
/// W1 W0 -- -- -- -- -- -- -- -- W3 W2 -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
2 changes: 1 addition & 1 deletion include/cutlass/wmma_array.h
@@ -47,7 +47,7 @@ namespace cutlass {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Wmma array type (WmmaFragmentArray holds elements of of type nvcuda::wmma::fragment)
/// Wmma array type (WmmaFragmentArray holds elements of type nvcuda::wmma::fragment)
template <
/// Element type
typename T,
2 changes: 1 addition & 1 deletion media/docs/cute/04_algorithms.md
@@ -116,7 +116,7 @@ would include the following.
access instructions (like `cp.async`), then dispatch to the
custom instruction.
2. The the two `Tensor`s have static layouts and it can be proven
2. The two `Tensor`s have static layouts and it can be proven
that element vectorization is valid -- for example, four `LDS.32`s
can be combined into a single `LDS.128` -- then vectorize the source
and destinations tensors.
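For context, a hedged host-side sketch of the `copy` call that sits in front of this dispatch logic; the include path and host usage are assumptions. On device, the same call is what may lower to `cp.async` or to vectorized loads/stores when the conditions above are met.

```cpp
#include <cute/tensor.hpp>
using namespace cute;

int main() {
  float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float dst[8] = {};

  // Fully static, contiguous layouts: the case in which the dispatch above
  // is allowed to coalesce several element accesses into one wider access.
  auto tS = make_tensor(src, make_layout(Int<8>{}));
  auto tD = make_tensor(dst, make_layout(Int<8>{}));

  copy(tS, tD);  // dst now mirrors src
  return 0;
}
```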
2 changes: 1 addition & 1 deletion media/docs/cute/0t_mma_atom.md
@@ -37,7 +37,7 @@ and the `Layout`s of threads and values within the operation.
The `MMA_Traits` struct takes the Operation as a template parameter.
CuTe specializes `MMA_Traits` for each Operation type that it supports.

Together, these two types comprise an "Atom" that decouples the complexity of thread and data layouts from the call site of of the PTX instruction. The Atom's Traits struct exposes information that is relevant to a single MMA operation, no matter the granularity at which it operates.
Together, these two types comprise an "Atom" that decouples the complexity of thread and data layouts from the call site of the PTX instruction. The Atom's Traits struct exposes information that is relevant to a single MMA operation, no matter the granularity at which it operates.

CuTe MMA atoms expose the semantics of a single MMA operation.
This is true regardless of the hardware level at which the MMA operates.
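A hedged sketch of how an Operation and its `MMA_Traits` pair up as an atom. The specific SM80 operation name and include set are assumptions; any Operation type that CuTe specializes `MMA_Traits` for follows the same pattern.

```cpp
#include <cute/tensor.hpp>
#include <cute/atom/mma_atom.hpp>
using namespace cute;

int main() {
  // One PTX-level MMA instruction: D(f32) += A(f16) * B(f16) on a 16x8x16 tile.
  using Op   = SM80_16x8x16_F32F16F16F32_TN;
  // The atom couples Op with MMA_Traits<Op>, which carries the logical MNK
  // shape and the thread/value layouts of that single instruction.
  using Atom = MMA_Atom<Op>;

  // Tiling the atom across more threads and data is a separate, later choice.
  auto tiled_mma = make_tiled_mma(Atom{});
  print(tiled_mma);
  return 0;
}
```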
2 changes: 1 addition & 1 deletion media/docs/cute/0x_gemm_tutorial.md
@@ -255,7 +255,7 @@ int bar()
}
```
"Static" is an unfortunately overloaded term in C++. Sometimes it means "the opposite of instance," like a "static function" or "static member" of a class. (Some programming languages, like Java, say "class method" to refer to a "static function of a class.") That's not what we mean here. Instead, we mean "part of a compile-time type." For example, `Int<1>` encodes the value 1 at compile time, as part of the type of a templated class `Int<Value>`. `Int<3>` and `Int<4>` have different types. You can get the value of of the type like this: `Int<3>::value`. (The `value` is a `static constexpr` member of the class, where "static" means "opposite of instance.") As soon as you go from `Int<3>` to `Int<3>::value`, you've gone from (3) above (a compile-time value) to (2) above (a `constexpr` value). In some situations, this may mean that the compiler treats it as a run-time value.
"Static" is an unfortunately overloaded term in C++. Sometimes it means "the opposite of instance," like a "static function" or "static member" of a class. (Some programming languages, like Java, say "class method" to refer to a "static function of a class.") That's not what we mean here. Instead, we mean "part of a compile-time type." For example, `Int<1>` encodes the value 1 at compile time, as part of the type of a templated class `Int<Value>`. `Int<3>` and `Int<4>` have different types. You can get the value of the type like this: `Int<3>::value`. (The `value` is a `static constexpr` member of the class, where "static" means "opposite of instance.") As soon as you go from `Int<3>` to `Int<3>::value`, you've gone from (3) above (a compile-time value) to (2) above (a `constexpr` value). In some situations, this may mean that the compiler treats it as a run-time value.
#### Strides
2 changes: 1 addition & 1 deletion media/docs/quickstart.md
@@ -56,7 +56,7 @@ You may explicitly exclude cuBLAS and cuDNN as dependencies with the following C

## Build and run the CUTLASS Profiler

From the `build/` directory created above, compile the the CUTLASS Profiler.
From the `build/` directory created above, compile the CUTLASS Profiler.
```bash
$ make cutlass_profiler -j12
```
6 changes: 3 additions & 3 deletions test/unit/conv/device/conv2d_testbed.h
@@ -696,7 +696,7 @@ bool TestAllConv2d(
return false;
}

// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the the number of tested problem counts
// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts
if (CutlassUnitTestProblemCount() &&
testbed.tested_problem_count > CutlassUnitTestProblemCount()) {
return true;
@@ -742,7 +742,7 @@ bool TestAllConv2d(
}
// Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
{1, 17, 11, 288}, // input size (NHWC)
@@ -784,7 +784,7 @@ bool TestAllConv2d(
return false;
}

// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the the number of tested problem counts
// If CUTLASS_UNIT_TEST_PROBLEM_COUNT is set reduce the number of tested problem counts
if (CutlassUnitTestProblemCount() &&
testbed.tested_problem_count > CutlassUnitTestProblemCount()) {
return true;
2 changes: 1 addition & 1 deletion test/unit/conv/device/conv2d_testbed_interleaved.h
@@ -609,7 +609,7 @@ bool TestAllInterleavedConv2d(
#if 0
// Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
{1, 17, 11, 288}, // input size (NHWC)
2 changes: 1 addition & 1 deletion test/unit/conv/device/conv2d_with_broadcast_testbed.h
@@ -632,7 +632,7 @@ bool TestAllConv2dWithBroadcast(

// Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
{1, 17, 11, 288}, // input size (NHWC)
2 changes: 1 addition & 1 deletion test/unit/conv/device/conv2d_with_reduction_testbed.h
@@ -587,7 +587,7 @@ bool TestAllConv2dWithReduction(

// Sweep split-k-slice using serial and prallel reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv2dProblemSize conv2d_split_k_test_size (
{1, 17, 11, 288}, // input size (NHWC)
2 changes: 1 addition & 1 deletion test/unit/conv/device/conv3d_testbed.h
@@ -613,7 +613,7 @@ bool TestAllConv3d(

// Sweep split-k-slice using serial reduction with non-unity alpha and non-zero beta for
// a single conv2d problem size. Convolution unit tests take a long time to run so only sweep parameters
// which are abolutely neccessary to catch functional bugs. The below code does provide option to sweep
// which are abolutely necessary to catch functional bugs. The below code does provide option to sweep
// alpha and beta for local testing, but only runs one value for alpha and beta.
cutlass::conv::Conv3dProblemSize conv3d_split_k_test_size (
{1, 8, 8, 8, 32}, // input size (NDHWC)