@@ -526,14 +526,14 @@ def _wrap_amp_model(self, args, model):
         if self.args.pipeline_parallel_degree > 1 or (self.args.tensor_parallel_degree > 1 and self.sharding is None):
             self.scaler = paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss)
             if self.args.amp_master_grad:
-                mix_precision_utils.MixPrecisionScaler(self.scaler)  # retun value has no use
+                mix_precision_utils.MixPrecisionScaler(self.scaler)  # return value has no use
             self.scaler = fleet.distributed_scaler(self.scaler)
         elif self.sharding is not None:
             self.scaler = paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss)
             if self.amp_dtype == "float16" or self.amp_dtype == "bfloat16":
                 if ShardingOption.SHARD_OP in self.args.sharding:
                     if self.args.amp_master_grad:
-                        mix_precision_utils.MixPrecisionScaler(self.scaler)  # retun value has no use
+                        mix_precision_utils.MixPrecisionScaler(self.scaler)  # return value has no use
                     self.scaler = fleet.distributed_scaler(self.scaler)
                 else:
                     # scaler for stage2 and stage3
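For readers following the AMP wiring in this hunk, here is a minimal sketch of the same GradScaler pattern in isolation. It is not the repository's code: it assumes fleet.init(is_collective=True) has already set up hybrid parallelism and that `model`, `optimizer`, and `inputs` exist. MixPrecisionScaler patches the scaler in place (hence the "return value has no use" comment being fixed above), and fleet.distributed_scaler then, broadly, keeps the loss-scaling state consistent across the parallel ranks.

import paddle
from paddle.distributed import fleet
from paddle.distributed.fleet.utils import mix_precision_utils

scaler = paddle.amp.GradScaler(init_loss_scaling=2**15)
mix_precision_utils.MixPrecisionScaler(scaler)  # patches the scaler in place; return value unused
scaler = fleet.distributed_scaler(scaler)       # coordinates scaling state across ranks

with paddle.amp.auto_cast(dtype="float16"):
    loss = model(inputs)  # hypothetical forward pass
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
optimizer.clear_grad()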
@@ -763,7 +763,7 @@ def create_zcc_manager(self, unwrapped_model, resume_from_checkpoint=None):
         """
         Create zero cost checkpoint manager.
         Has to be called after pipeline model is created.
-        resume_from_checkpoint: if use Flash checkpoing EMA, load previous checkpoint status
+        resume_from_checkpoint: if use Flash checkpoint EMA, load previous checkpoint status
         """
         assert isinstance(
             self.model, PretrainedModel
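The docstring pins down an ordering constraint, so here is a purely hypothetical sketch of the call order it implies; only create_zcc_manager and resume_from_checkpoint come from the hunk header, the wrapping step is a placeholder and this is not the trainer's actual call site.

# The pipeline-wrapped model must exist before the zero cost checkpoint (ZCC) manager is created.
model = self._wrap_model(self.model_wrapped)  # placeholder for the step that builds the pipeline model
self.create_zcc_manager(self.model, resume_from_checkpoint=resume_from_checkpoint)  # pass the unwrapped model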
@@ -1223,7 +1223,7 @@ def _inner_training_loop(
         ) and availiable_no_sync
         # sharding
         # stage1. the same as ddp
-        # stage2. manualy collect gradient on dp group
+        # stage2. manually collect gradient on dp group
 
         dp_master_grad = (
             self.args.world_size > 1 and self.args.amp_master_grad and not self.args.use_hybrid_parallel
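As a hedged aside on what the amp_master_grad flag in the condition above turns on at the Paddle API level, a minimal sketch (not the trainer's code; it assumes `model` and `optimizer` are already built):

from paddle.distributed.fleet.utils import mix_precision_utils

# Keep FP32 master gradients while the forward/backward pass runs in float16.
model = mix_precision_utils.MixPrecisionLayer(model, dtype="float16")
optimizer = mix_precision_utils.MixPrecisionOptimizer(optimizer)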
@@ -1263,15 +1263,15 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
                     self._check_loss_valid(tr_loss)
 
                 self.timers and self.timers("forward-backward").stop()
-                # Maunally collect gradients
+                # Manually collect gradients
                 # Case 1: Use recompute and dp
                 # Case 2: Hack dp with master_grad
                 # Case 3: Pipeline or sharding overlap
                 # local_rank != -1 don't means dp in networks.
                 self.timers and self.timers("all-reduce").start()
 
                 # Case 1: Use recompute and dp / sharding stage1,
-                # manualy collect gradient for dp.
+                # manually collect gradient for dp.
                 if (args.recompute or args.use_expert_parallel) and availiable_no_sync:
                     fused_allreduce_gradients_no_sync(list(model.parameters()), None)
 
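The comments in this hunk describe when gradients have to be collected by hand. A minimal sketch of that collection step, assuming fleet.init(...) has run and `model` is the data-parallel replica; note the trainer's wrapper above passes None as the group, which falls back to the default communication group.

from paddle.distributed import fleet
from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients

hcg = fleet.get_hybrid_communicate_group()
# All-reduce every parameter's gradient once across the data-parallel group,
# after gradient accumulation has finished and before optimizer.step().
fused_allreduce_gradients(list(model.parameters()), hcg)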
@@ -2041,21 +2041,21 @@ def _load_rng_state(self, checkpoint):
         core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"])
         if core.is_compiled_with_cuda():
             if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count():
-                raise ValueError("Length of gpu state list shoule be equal to the gpu device count")
+                raise ValueError("Length of gpu state list should be equal to the gpu device count")
             for i in range(core.get_cuda_device_count()):
                 core.default_cuda_generator(i).set_state(checkpoint_rng_state["cuda"][i])
 
         if core.is_compiled_with_xpu():
             if not len(checkpoint_rng_state["cuda"]) == core.get_xpu_device_count():
-                raise ValueError("Length of xpu state list shoule be equal to the xpu device count")
+                raise ValueError("Length of xpu state list should be equal to the xpu device count")
             for i in range(core.get_xpu_device_count()):
                 core.default_xpu_generator(i).set_state(checkpoint_rng_state["cuda"][i])
 
         if paddle.device.get_all_custom_device_type() is not None:
             custom_device_type = paddle.device.get_all_custom_device_type()
             for device in custom_device_type:
                 if not len(checkpoint_rng_state["cuda"]) == core.get_custom_device_count(device):
-                    raise ValueError("Length of custom device state list shoule be equal to the custom device count")
+                    raise ValueError("Length of custom device state list should be equal to the custom device count")
                 for i in range(core.get_custom_device_count(device)):
                     core.default_custom_device_generator(paddle.CustomPlace(device, i)).set_state(
                         checkpoint_rng_state["cuda"][i]
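The loader in this hunk reuses the "cuda" key when restoring xpu and custom-device generators as well. For orientation, a minimal sketch of the save side it expects, using standard paddle RNG APIs rather than the trainer's exact helper; the rng_state.pth file name is illustrative.

import random
import numpy as np
import paddle
from paddle.framework import core

checkpoint_rng_state = {
    "python": random.getstate(),
    "numpy": np.random.get_state(),
    "cpu": core.default_cpu_generator().get_state(),
    # One generator state per visible GPU, which is why the loader compares
    # the list length with core.get_cuda_device_count().
    "cuda": paddle.get_cuda_rng_state() if paddle.is_compiled_with_cuda() else [],
}
paddle.save(checkpoint_rng_state, "rng_state.pth")  # illustrative file name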