Commit 37af05d

Fix

1 parent 237933b commit 37af05d
16 files changed: +62 -62 lines changed

paddlenlp/layers/crf.py (+1 -1)

@@ -248,7 +248,7 @@ def __init__(self, crf):
         self.crf = crf
         if isinstance(crf, paddle.Tensor):
             raise ValueError(
-                "From paddlenlp >= 2.0.0b4, the first param of LinearChainCrfLoss shoule be a LinearChainCrf object. For input parameter 'crf.transitions', you can remove '.transitions' to 'crf'"
+                "From paddlenlp >= 2.0.0b4, the first param of LinearChainCrfLoss should be a LinearChainCrf object. For input parameter 'crf.transitions', you can remove '.transitions' to 'crf'"
             )

     def forward(self, inputs, lengths, labels, old_version_labels=None):

paddlenlp/ops/distributed/parallel.py (+4 -4)

@@ -191,8 +191,8 @@ def __init__(self, size, num_partitions=1, gather_out=True, param_attr=None, bia
             main_block = paddle.static.default_main_program().global_block()
             startup_block.vars[weight.name].is_distributed = True
             main_block.vars[weight.name].is_distributed = True
-            # set is_distributed for splited bias
-            # if a linear layer is splited by col, the bias would also be split into each rank as its weight
+            # set is_distributed for split bias
+            # if a linear layer is split by col, the bias would also be split into each rank as its weight
             if self.linear._bias_attr:
                 startup_block.vars[self.linear.bias.name].is_distributed = True
                 main_block.vars[self.linear.bias.name].is_distributed = True
@@ -285,8 +285,8 @@ def __init__(self, size, num_partitions=1, input_is_parallel=False, param_attr=N
             main_block = paddle.static.default_main_program().global_block()
             startup_block.vars[weight.name].is_distributed = True
             main_block.vars[weight.name].is_distributed = True
-            # set is_distributed for splited bias
-            # if a linear layer is splited by row, each rank would hold a complete bias
+            # set is_distributed for split bias
+            # if a linear layer is split by row, each rank would hold a complete bias

         if bias_attr is not False:
             self.bias = self.create_parameter(shape=[num_cols], attr=bias_attr, dtype=self._dtype, is_bias=True)
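
The two comments corrected in this file state the bias convention for the two partitioning directions: a column-split linear owns a slice of the output features, so each rank also keeps the matching slice of the bias, while a row-split linear produces partial outputs that are summed across ranks, so the full bias is kept and added once. A minimal NumPy sketch of that shape arithmetic (illustrative only, not the paddlenlp implementation; the rank count and sizes are made up):

import numpy as np

np.random.seed(0)
in_features, out_features, num_ranks = 8, 6, 2

x = np.random.randn(4, in_features)            # a batch of 4 inputs
w = np.random.randn(in_features, out_features)
b = np.random.randn(out_features)
reference = x @ w + b

# Column parallel: split the output dim, so the bias is split the same way.
w_cols = np.split(w, num_ranks, axis=1)        # each rank holds [in, out / num_ranks]
b_cols = np.split(b, num_ranks)                # each rank holds [out / num_ranks]
col_out = np.concatenate([x @ wi + bi for wi, bi in zip(w_cols, b_cols)], axis=1)

# Row parallel: split the input dim; partial results are summed (the all-reduce),
# so every rank can hold the complete bias and it is added only once.
w_rows = np.split(w, num_ranks, axis=0)        # each rank holds [in / num_ranks, out]
x_rows = np.split(x, num_ranks, axis=1)        # the input is split to match
row_out = sum(xi @ wi for xi, wi in zip(x_rows, w_rows)) + b

assert np.allclose(col_out, reference)
assert np.allclose(row_out, reference)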

paddlenlp/quantization/checkpoint_quantization_utils.py (+7 -7)

@@ -63,7 +63,7 @@ def group_wise_quant_dequant(
         tp_degree (`int`):
             Tensor parallel world size.
         use_pd (`bool`):
-            Whether to use paddle caculation. If False will use numpy.
+            Whether to use paddle calculation. If False will use numpy.
         symmetry (`bool`):
             Whether to use symmetry quantization.
     """
@@ -201,7 +201,7 @@ def cal_abs_min_max_channel(inputs, quant_axis=1):
         inputs (`numpy.array`):
             input tensor for quantization.
         quant_axis (`int`):
-            dimension where calulating inputs' abs min and max scales on.
+            dimension where calculating inputs' abs min and max scales on.
     """
     eps = 1e-8
     reduce_axis = tuple([i for i in range(len(inputs.shape)) if i != quant_axis])
@@ -227,7 +227,7 @@ def asymmetry_qdq_weight(
         quant_bits (`int`):
             Quantization bits.
         quant_axis (`int`):
-            Scales caculation axis.
+            Scales calculation axis.
         mins (`paddle.Tensor`):
             Min scales tensor in asymmetry quantization.
         maxs (`paddle.Tensor`):
@@ -239,7 +239,7 @@ def asymmetry_qdq_weight(
         tp_degree (`int`):
             Model parallel world size.
         use_pd (`bool`):
-            Whether to use paddle caculation. If False will use numpy.
+            Whether to use paddle calculation. If False will use numpy.
     """

     if mins is None:
@@ -288,7 +288,7 @@ def cal_abs_max_channel(inputs, quant_axis=1):
         inputs (`numpy.array`):
             input tensor for quantization.
         quant_axis (`int`):
-            dimension where calulating inputs' abs max scales on.
+            dimension where calculating inputs' abs max scales on.
     """
     epsilon = 1e-8
     reduce_axis = tuple([i for i in range(len(inputs.shape)) if i != quant_axis])
@@ -311,7 +311,7 @@ def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, tp_ran
         quant_bits (`int`):
             Quantization bits.
         quant_axis (`int`):
-            Scales caculation axis.
+            Scales calculation axis.
         scales (`paddle.Tensor`):
             Abs max scales tensor in symmetry quantization.
         dequant (`bool`):
@@ -321,7 +321,7 @@ def qdq_weight(x, quant_bit=8, quant_axis=-1, scales=None, dequant=False, tp_ran
         tp_degree (`int`):
             Model parallel world size.
         use_pd (`bool`):
-            Whether to use paddle caculation. If False will use numpy.
+            Whether to use paddle calculation. If False will use numpy.
     """

     if scales is None:
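
For readers following the docstrings touched here: the abs-max helpers reduce |x| over every axis except quant_axis to get one scale per channel, and the symmetric quant/dequant round trip then maps the weight into the signed integer range and back. A rough, self-contained sketch of that idea (simplified NumPy only; the function names and the omission of the asymmetric, group-wise, and tensor-parallel paths are my own, not the paddlenlp API):

import numpy as np

def abs_max_scales(x, quant_axis=1, eps=1e-8):
    """One abs-max scale per channel along quant_axis, reduced over all other axes."""
    reduce_axis = tuple(i for i in range(x.ndim) if i != quant_axis)
    return np.maximum(np.abs(x).max(axis=reduce_axis), eps)

def quant_dequant(x, quant_bit=8, quant_axis=1):
    """Symmetric round trip: quantize to the signed integer range and dequantize back."""
    bnt = (1 << (quant_bit - 1)) - 1                  # 127 for 8-bit
    scales = abs_max_scales(x, quant_axis)
    shape = [1] * x.ndim
    shape[quant_axis] = -1                            # broadcast scales over the channel axis
    scales = scales.reshape(shape)
    q = np.clip(np.round(x / scales * bnt), -bnt, bnt).astype(np.int8)
    return q.astype(np.float32) * scales / bnt        # dequantized approximation of x

w = np.random.randn(16, 4).astype(np.float32)
w_hat = quant_dequant(w, quant_bit=8, quant_axis=1)
print(np.abs(w - w_hat).max())                        # small reconstruction error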

paddlenlp/quantization/quantization_linear.py (+3 -3)

@@ -250,7 +250,7 @@ def __init__(
         self.quant_dtype, self.quant_weight_bit = QuantMapping[self.weight_quantize_algo]
         self.state = 0

-        # PaddlePaddle dosen't support 4bit data type, one 8bit data represents two 4bit data.
+        # PaddlePaddle doesn't support 4bit data type, one 8bit data represents two 4bit data.
         # paddle.nn.quant.weight_quantize will transpose in_features and out_features.
         if self.weight_quantize_algo in [
             "weight_only_int8",
@@ -405,7 +405,7 @@ def __init__(
         if self.sequence_parallel and self.gather_output:
             raise ValueError("Sequence parallel does not support gather_output")

-        # PaddlePaddle dosen't support Int4 data type, one Int8 data represents two Int4 data.
+        # PaddlePaddle doesn't support Int4 data type, one Int8 data represents two Int4 data.
         if self.weight_quantize_algo in [
             "weight_only_int8",
             "weight_only_int4",
@@ -542,7 +542,7 @@ def __init__(
         if not self.input_is_parallel and self.sequence_parallel:
             raise ValueError("Sequence parallel only support input_is_parallel.")

-        # PaddlePaddle dosen't support Int4 data type, one Int8 data represents two Int4 data.
+        # PaddlePaddle doesn't support Int4 data type, one Int8 data represents two Int4 data.
         # paddle.nn.quant.weight_quantize will transpose in_features and out_features.
         if self.weight_quantize_algo in [
             "weight_only_int8",

paddlenlp/rl/trainer/rl_trainer.py (+5 -5)

@@ -674,7 +674,7 @@ def get_train_step_vars(self, vars: Optional[Dict] = None) -> Dict:
             if paddle.distributed.get_world_size() > 1:
                 assert self.model is not self.model_wrapped
             self.train_step_vars = {
-                # meaningless vars can pass from outter, dummy value is enough
+                # meaningless vars can pass from outer, dummy value is enough
                 "epoch": 0, # meaningless for step training
                 "step": 0, # meaningless for step training
                 "steps_in_epoch": 100000, # meaningless for step training
@@ -718,15 +718,15 @@ def full_training_step(self, **inputs) -> paddle.Tensor:
         # trainer.train use `tr_loss` as loss var to accumulate loss.
         # NOTE: `tr_loss` in trainer.train not only accumulate mean loss for
         # steps in one `gradient_accumulation_steps`, but also accumulate for
-        # one logging intervel which may contains more than one accumulated steps.
+        # one logging interval which may contains more than one accumulated steps.
         # However, in RLTrainer we only want to use `tr_loss` to accumulate
         # mean loss for steps in a `gradient_accumulation_steps` range. As for
-        # logging intervel loss accumulation is not take into account here and
-        # should be considered in outter.
+        # logging interval loss accumulation is not take into account here and
+        # should be considered in outer.
         if loss_var is None: # the first step of current loss type
             loss_var = paddle.to_tensor(0.0)
             train_step_vars[loss_name] = loss_var
-        elif self.is_accumulation_step: # begin a new accumulation step intervel
+        elif self.is_accumulation_step: # begin a new accumulation step interval
             for name in self.loss_names:
                 train_step_vars[name] = paddle.to_tensor(0.0)
         loss_var = train_step_vars[loss_name]
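
The corrected comment block draws a line between two accumulation scopes: Trainer.train lets `tr_loss` keep growing across a whole logging interval, whereas this method only wants the mean over one `gradient_accumulation_steps` window and resets it when a new window begins, leaving logging-interval averaging to the caller. A toy sketch of that reset-per-window behaviour (plain Python with invented numbers, not the trainer code):

gradient_accumulation_steps = 4
step_losses = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]  # one loss per micro step

loss_var = 0.0
for step, loss in enumerate(step_losses):
    if step % gradient_accumulation_steps == 0:
        loss_var = 0.0                        # begin a new accumulation interval
    loss_var += loss / gradient_accumulation_steps
    if (step + 1) % gradient_accumulation_steps == 0:
        # mean loss for this window only; a logging-interval average would
        # have to be accumulated outside, by the caller
        print(f"optimizer step {step // gradient_accumulation_steps}: {loss_var:.3f}")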

paddlenlp/taskflow/knowledge_mining.py (+1 -1)

@@ -146,7 +146,7 @@
 class WordTagTask(Task):
     """
     This the NER(Named Entity Recognition) task that convert the raw text to entities. And the task with the `wordtag`
-    model will link the more meesage with the entity.
+    model will link the more message with the entity.
     Args:
         task(string): The name of task.
         model(string): The model name in the task.

paddlenlp/taskflow/lexical_analysis.py (+1 -1)

@@ -68,7 +68,7 @@ def load_vocab(dict_path):

 class LacTask(Task):
     """
-    Lexical analysis of Chinese task to segement the chinese sentence.
+    Lexical analysis of Chinese task to segment the chinese sentence.
     Args:
         task(string): The name of task.
         model(string): The model name in the task.

paddlenlp/taskflow/named_entity_recognition.py (+1 -1)

@@ -74,7 +74,7 @@
 class NERWordTagTask(WordTagTask):
     """
     This the NER(Named Entity Recognition) task that convert the raw text to entities. And the task with the `wordtag`
-    model will link the more meesage with the entity.
+    model will link the more message with the entity.
     Args:
         task(string): The name of task.
         model(string): The model name in the task.

paddlenlp/trainer/auto_trainer.py (+1 -1)

@@ -109,7 +109,7 @@ def parallel_model(cls, model, training_args: AutoTrainingArguments):
             model (paddle.nn.Layer): the model to be parallelized.
             training_args (AutoTrainingArguments) : Training arguments which contain distributed information
         Returns:
-            the model after parallelize and config conatins distributed strategy
+            the model after parallelize and config contains distributed strategy
         """
         if not training_args.use_intermediate_api:
             return model, None

paddlenlp/trainer/plugins/timer.py (+1 -1)

@@ -26,7 +26,7 @@


 class _Timer:
-    """Profile Timer for recording time taken by forward/ bacward/ reduce/ step."""
+    """Profile Timer for recording time taken by forward/ backward/ reduce/ step."""

     def __init__(self, name):
         self.name = name

paddlenlp/trainer/trainer.py (+9 -9)

@@ -526,14 +526,14 @@ def _wrap_amp_model(self, args, model):
         if self.args.pipeline_parallel_degree > 1 or (self.args.tensor_parallel_degree > 1 and self.sharding is None):
             self.scaler = paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss)
             if self.args.amp_master_grad:
-                mix_precision_utils.MixPrecisionScaler(self.scaler) # retun value has no use
+                mix_precision_utils.MixPrecisionScaler(self.scaler) # return value has no use
             self.scaler = fleet.distributed_scaler(self.scaler)
         elif self.sharding is not None:
             self.scaler = paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss)
             if self.amp_dtype == "float16" or self.amp_dtype == "bfloat16":
                 if ShardingOption.SHARD_OP in self.args.sharding:
                     if self.args.amp_master_grad:
-                        mix_precision_utils.MixPrecisionScaler(self.scaler) # retun value has no use
+                        mix_precision_utils.MixPrecisionScaler(self.scaler) # return value has no use
                     self.scaler = fleet.distributed_scaler(self.scaler)
                 else:
                     # scaler for stage2 and stage3
@@ -763,7 +763,7 @@ def create_zcc_manager(self, unwrapped_model, resume_from_checkpoint=None):
         """
         Create zero cost checkpoint manager.
         Has to be called after pipeline model is created.
-        resume_from_checkpoint: if use Flash checkpoing EMA, load previous checkpoint status
+        resume_from_checkpoint: if use Flash checkpoint EMA, load previous checkpoint status
         """
         assert isinstance(
             self.model, PretrainedModel
@@ -1223,7 +1223,7 @@ def _inner_training_loop(
         ) and availiable_no_sync
         # sharding
         # stage1. the same as ddp
-        # stage2. manualy collect gradient on dp group
+        # stage2. manually collect gradient on dp group

         dp_master_grad = (
             self.args.world_size > 1 and self.args.amp_master_grad and not self.args.use_hybrid_parallel
@@ -1263,15 +1263,15 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
                     self._check_loss_valid(tr_loss)

                 self.timers and self.timers("forward-backward").stop()
-                # Maunally collect gradients
+                # Manually collect gradients
                 # Case 1: Use recompute and dp
                 # Case 2: Hack dp with master_grad
                 # Case 3: Pipeline or sharding overlap
                 # local_rank != -1 don't means dp in networks.
                 self.timers and self.timers("all-reduce").start()

                 # Case 1: Use recompute and dp / sharding stage1,
-                # manualy collect gradient for dp.
+                # manually collect gradient for dp.
                 if (args.recompute or args.use_expert_parallel) and availiable_no_sync:
                     fused_allreduce_gradients_no_sync(list(model.parameters()), None)

@@ -2041,21 +2041,21 @@ def _load_rng_state(self, checkpoint):
         core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"])
         if core.is_compiled_with_cuda():
             if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count():
-                raise ValueError("Length of gpu state list shoule be equal to the gpu device count")
+                raise ValueError("Length of gpu state list should be equal to the gpu device count")
             for i in range(core.get_cuda_device_count()):
                 core.default_cuda_generator(i).set_state(checkpoint_rng_state["cuda"][i])

         if core.is_compiled_with_xpu():
             if not len(checkpoint_rng_state["cuda"]) == core.get_xpu_device_count():
-                raise ValueError("Length of xpu state list shoule be equal to the xpu device count")
+                raise ValueError("Length of xpu state list should be equal to the xpu device count")
             for i in range(core.get_xpu_device_count()):
                 core.default_xpu_generator(i).set_state(checkpoint_rng_state["cuda"][i])

         if paddle.device.get_all_custom_device_type() is not None:
             custom_device_type = paddle.device.get_all_custom_device_type()
             for device in custom_device_type:
                 if not len(checkpoint_rng_state["cuda"]) == core.get_custom_device_count(device):
-                    raise ValueError("Length of custom device state list shoule be equal to the custom device count")
+                    raise ValueError("Length of custom device state list should be equal to the custom device count")
                 for i in range(core.get_custom_device_count(device)):
                     core.default_custom_device_generator(paddle.CustomPlace(device, i)).set_state(
                         checkpoint_rng_state["cuda"][i]

paddlenlp/trainer/trainer_compress.py (+2 -2)

@@ -700,10 +700,10 @@ def _quant_aware_training_dynamic(self, input_dir):
     args.output_filename_prefix = "int8"

     quant_config = {
-        # It defauts to None, which means that no preprocessing is performed
+        # It defaults to None, which means that no preprocessing is performed
         # on the active value."
         "activation_preprocess_type": "PACT" if args.use_pact else None,
-        # It defauts to None, which means that no preprocessing is performed
+        # It defaults to None, which means that no preprocessing is performed
         # on weights.
         "weight_preprocess_type": "PACT" if args.use_pact else None,
         "weight_quantize_type": args.weight_quantize_type,
