Commit 0af1249

Fix typos shoulde should (#71350)
1 parent 971869a commit 0af1249

File tree: 11 files changed (+16 −15 lines)

paddle/cinn/hlir/framework/pir/trivial_op_util.cc

Lines changed: 2 additions & 2 deletions
@@ -1072,7 +1072,7 @@ void CheckLoopAlignment(const std::vector<ir::Expr>& roots) {
   }
   PADDLE_ENFORCE(fusion::VectorEqual(base_loop_vars, loop_vars, var_equal),
                  ::common::errors::PreconditionNotMet(
-                     "CheckLoopAlignment Failed, The loop vars are not euqal "
+                     "CheckLoopAlignment Failed, The loop vars are not equal "
                      "between FusionOps: \n%s\n%s",
                      roots[base_loop_idx],
                      roots[i]));
@@ -1087,7 +1087,7 @@ void CheckLoopAlignment(const std::vector<ir::Expr>& roots) {
   PADDLE_ENFORCE(
       fusion::VectorEqual(base_reduce_vars, reduce_vars, var_equal),
       ::common::errors::PreconditionNotMet(
-          "CheckLoopAlignment Failed, The reduce vars are not euqal "
+          "CheckLoopAlignment Failed, The reduce vars are not equal "
           "between FusionOps: \n%s\n%s",
           roots[base_reduce_idx],
           roots[i]));

paddle/phi/core/distributed/auto_parallel/placement_types.cc

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ phi::distributed::Placements cvt_dim_map_to_placements(
       auto& p = placements[mesh_id];
       if (p->is_shard()) {
         PADDLE_THROW(common::errors::PreconditionNotMet(
-            "ProcessMesh dimension cann't be mapped to two dimension of the "
+            "ProcessMesh dimension can't be mapped to two dimension of the "
             "same tensor: {%d} and {%d}",
             i,
             dynamic_cast<phi::distributed::Shard&>(*p).get_dim()));

paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h

Lines changed: 1 addition & 1 deletion
@@ -168,7 +168,7 @@ class CublasLtAlgoCache {
       cudaDataType_t bias_type,
       cudaDataType_t c_type,
       cudaStream_t stream) {
-    // If we don't have config file and we donot search, here return nullptr
+    // If we don't have config file and we do not search, here return nullptr
     if (!has_config_file_ && search_times_ <= 0) {
       return nullptr;
     }

paddle/phi/kernels/funcs/distribution_helper.h

Lines changed: 1 addition & 1 deletion
@@ -311,7 +311,7 @@ void distribution_and_transform(const GPUContext &ctx,
   size_t total_thread = block_size * grid_size;
   size_t curand4_loop_times =
       (size + 4 * total_thread - 1) / (4 * total_thread);
-  // 'increment' shoulde be multiple of 4
+  // 'increment' should be multiple of 4
   uint64_t increment = curand4_loop_times * 4;

   auto seed_offset = gen_cuda->IncrementOffset(increment);
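
For context on the fixed comment: a curand4-style call produces four values per thread, so the loop count is a ceiling division by 4 * total_thread, and the offset rounded up to a multiple of 4 is what gets passed to IncrementOffset. A minimal standalone sketch of that arithmetic (the example values are hypothetical, not taken from the kernel launch):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical launch configuration; in the kernel these come from the
  // block_size and grid_size chosen by the GPU context.
  std::size_t size = 10000;            // number of random values required
  std::size_t total_thread = 256 * 8;  // threads per block * blocks

  // Ceiling division: each thread draws 4 values per curand4 call.
  std::size_t curand4_loop_times =
      (size + 4 * total_thread - 1) / (4 * total_thread);

  // 'increment' should be a multiple of 4, because every loop iteration
  // advances the generator offset by 4 values per thread.
  std::uint64_t increment = curand4_loop_times * 4;

  std::printf("loop_times=%zu increment=%llu\n", curand4_loop_times,
              static_cast<unsigned long long>(increment));
  return 0;
}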

paddle/phi/kernels/funcs/jit/gen/seqpool.h

Lines changed: 2 additions & 2 deletions
@@ -131,8 +131,8 @@ class SeqPoolJitCode : public JitCode {
       rest_used_num_regs,
       common::errors::InvalidArgument(
           "All heights of SeqPool should use the same number of registers."
-          "It equals to the numbr of rest registers. But use %d registers "
-          "and the numbr of rest registers is %d.",
+          "It equals to the number of rest registers. But use %d registers "
+          "and the number of rest registers is %d.",
           reg_idx,
           rest_used_num_regs));
   for (int i = 0; i < reg_idx; ++i) {

paddle/phi/kernels/funcs/stack_and_unstack.h

Lines changed: 1 addition & 1 deletion
@@ -255,7 +255,7 @@ void UnStackRawKernel(const Context& ctx,
                       std::vector<DenseTensor*>* outs) {
   auto x_dims = x.dims();

-  // Input tensor is splited to split_dim tensors along split_dim dimension.
+  // Input tensor is split to split_dim tensors along split_dim dimension.
   int64_t split_dim = x_dims[axis];

   // zero sized tensor case
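
To make the corrected comment concrete: unstacking removes the axis dimension, producing split_dim output tensors that each keep the remaining dimensions. A minimal sketch of the shape arithmetic (standalone C++ with an illustrative shape, not Paddle's kernel):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical input shape [2, 3, 4], unstacked along axis = 1.
  std::vector<int64_t> x_dims = {2, 3, 4};
  int axis = 1;

  // split_dim tensors come out, one per slice along 'axis'.
  int64_t split_dim = x_dims[axis];  // 3 outputs

  // Each output drops the unstacked axis and keeps the rest: [2, 4].
  std::vector<int64_t> out_dims;
  for (std::size_t i = 0; i < x_dims.size(); ++i) {
    if (static_cast<int>(i) != axis) out_dims.push_back(x_dims[i]);
  }

  std::printf("%lld outputs of shape [%lld, %lld]\n",
              static_cast<long long>(split_dim),
              static_cast<long long>(out_dims[0]),
              static_cast<long long>(out_dims[1]));
  return 0;
}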

paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/iterators/epilogue_predicated_tile_iterator.h

Lines changed: 1 addition & 1 deletion
@@ -86,7 +86,7 @@ namespace threadblock {
 /// Satisfies: ReadableTileIterator | PredicatedTileIterator |
 /// ForwardTileIterator
 ///
-template <typename ThreadMap_,   ///< Thread map (conept: OutputTileThreadMap)
+template <typename ThreadMap_,   ///< Thread map (concept: OutputTileThreadMap)
           typename Element_,     ///< Element data type
           bool ScatterD = false, ///< Scatter D operand or not
           bool UseCUDAStore = false>

paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu

Lines changed: 1 addition & 1 deletion
@@ -1086,7 +1086,7 @@ void DispatchWithDtype(const Context &dev_ctx,
   params.inv_sqrt_dh = inv_sqrt_dh;
   params.rotary_emb_dims = rotary_emb_dims;

-  params.steps_per_block = timestep;  // if not SPLIT, this is unuseful.
+  params.steps_per_block = timestep;  // if not SPLIT, this is useless.
   params.split_seq = 1;  // if not SPLIT, grid.x==1

   bool SPLIT = false;

paddle/phi/kernels/gpu/lars_momentum_kernel.cu

Lines changed: 3 additions & 2 deletions
@@ -154,7 +154,8 @@ __device__ inline void VectorizeLarsUpdate(const T* __restrict__ grad,
   --rdc=true compile flag, then L2_norm kernel can be set with __device__ and
   cooperative_groups::grid_group also can be involved. Otherwise, adding this
   flag may affect much, L2_norm kernel shall be set with __global__.*/
-// TODO(limingshu): declaration of cooperative_groups wapper is invalid in host.
+// TODO(limingshu): declaration of cooperative_groups wrapper is invalid in
+// host.
 template <typename T, typename MT>
 __forceinline__ __device__ void L2NormKernel(
     const cooperative_groups::grid_group* cg,
@@ -193,7 +194,7 @@ __global__ void L2NormKernel(
     g_buffer[blockIdx.x] = g_tmp;
   }
 #if CUDA_VERSION >= 11000
-  cg->sync();  // Grid sync for writring partial result to global memory
+  cg->sync();  // Grid sync for writing partial result to global memory
   MT p_part_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0;
   MT g_part_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0;
   MT tmp0 = phi::funcs::BlockReduceSum<MT>(p_part_sum, FINAL_MASK);
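
The fixed comment describes a two-phase grid reduction: every block writes its partial result to a global buffer, the whole grid synchronizes, and the partials are then combined. A minimal CUDA sketch of that pattern (not Paddle's L2NormKernel; names and sizes are illustrative, and it requires a cooperative launch):

#include <cooperative_groups.h>
#include <cstdio>

namespace cg = cooperative_groups;

__global__ void GridSumKernel(const float* in, float* partial, float* out,
                              int n) {
  cg::grid_group grid = cg::this_grid();

  // Phase 1: block-local sum over a grid-stride loop.
  float local = 0.f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    local += in[i];
  }
  __shared__ float smem[256];
  smem[threadIdx.x] = local;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
    __syncthreads();
  }
  if (threadIdx.x == 0) partial[blockIdx.x] = smem[0];

  // Grid sync for writing partial results to global memory before any
  // block reads them back.
  grid.sync();

  // Phase 2: fold the per-block partials into the final result.
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    float sum = 0.f;
    for (int b = 0; b < gridDim.x; ++b) sum += partial[b];
    *out = sum;
  }
}

int main() {
  const int n = 1 << 20, block = 256, grid_dim = 64;
  float *in, *partial, *out;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&partial, grid_dim * sizeof(float));
  cudaMalloc(&out, sizeof(float));
  cudaMemset(in, 0, n * sizeof(float));

  int n_arg = n;
  void* args[] = {&in, &partial, &out, &n_arg};
  // Cooperative launch is required for grid-wide synchronization.
  cudaLaunchCooperativeKernel(reinterpret_cast<void*>(GridSumKernel),
                              dim3(grid_dim), dim3(block), args, 0, nullptr);
  cudaDeviceSynchronize();
  return 0;
}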

paddle/phi/kernels/gpu/multinomial_kernel.cu

Lines changed: 1 addition & 1 deletion
@@ -244,7 +244,7 @@ void MultinomialKernel(const Context& dev_ctx,
   auto gen_cuda = dev_ctx.GetGenerator();
   size_t curand4_loop_times =
       (num_distributions + 4 * grid_y - 1) / (4 * grid_y);
-  // 'increment' shoulde be multiple of 4
+  // 'increment' should be multiple of 4
   uint64_t increment = curand4_loop_times * 4;
   auto seed_offset = gen_cuda->IncrementOffset(increment);
