From 4157e3cd7e2bb5a7be6dc065a3e20c49cc1300ab Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Tue, 13 Sep 2022 18:16:36 +0100
Subject: [PATCH] new length penalty docstring (#19006)

---
 src/transformers/configuration_utils.py      |  5 ++++-
 src/transformers/generation_beam_search.py   | 14 +++++++------
 src/transformers/generation_tf_utils.py      | 21 +++++++++++--------
 src/transformers/generation_utils.py         |  7 ++++---
 .../models/fsmt/configuration_fsmt.py        |  5 ++++-
 src/transformers/models/rag/modeling_rag.py  |  8 +++----
 .../models/rag/modeling_tf_rag.py            |  8 +++----
 7 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index db8147b4dee349..3fdc0f265f6331 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -148,7 +148,10 @@ class PretrainedConfig(PushToHubMixin):
             Parameter for repetition penalty that will be used by default in the `generate` method of the model. 1.0
             means no penalty.
         length_penalty (`float`, *optional*, defaults to 1):
-            Exponential penalty to the length that will be used by default in the `generate` method of the model.
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
         no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the
             `generate` method of the model for `no_repeat_ngram_size`. If set to int > 0, all ngrams of that size can
             only occur once.
diff --git a/src/transformers/generation_beam_search.py b/src/transformers/generation_beam_search.py
index 7c50c0d7acdccc..902160b228fdc2 100644
--- a/src/transformers/generation_beam_search.py
+++ b/src/transformers/generation_beam_search.py
@@ -138,9 +138,10 @@ class BeamSearchScorer(BeamScorer):
             Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be
             allocated.
         length_penalty (`float`, *optional*, defaults to 1.0):
-            Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
-            model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
-            sequences.
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
         do_early_stopping (`bool`, *optional*, defaults to `False`):
             Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
         num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
@@ -405,9 +406,10 @@ class ConstrainedBeamSearchScorer(BeamScorer):
             Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be
             allocated.
         length_penalty (`float`, *optional*, defaults to 1.0):
-            Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
-            model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
-            sequences.
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
         do_early_stopping (`bool`, *optional*, defaults to `False`):
             Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
         num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index 5652b0e180de85..30319dcce389ed 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -455,10 +455,10 @@ def generate(
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
-
-                Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
-                order to encourage the model to produce longer sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
             bad_words_ids(`List[int]`, *optional*):
@@ -1419,10 +1419,10 @@ def _generate(
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
-
-                Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
-                order to encourage the model to produce longer sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
             bad_words_ids(`List[int]`, *optional*):
@@ -2657,7 +2657,10 @@ def beam_search(
            eos_token_id (`int`, *optional*):
                The id of the *end-of-sequence* token.
            length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
            early_stopping (`bool`, *optional*, defaults to `False`):
                Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
            logits_processor (`[TFLogitsProcessorList]`, *optional*):
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 7544473d7838ec..84f1a6f0392a38 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -1005,9 +1005,10 @@ def generate(
             eos_token_id (`int`, *optional*, defaults to `model.config.eos_token_id`):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to `model.config.length_penalty` or 1.0 if the config does not set any value):
-                Exponential penalty to the length. 1.0 means that the beam score is penalized by the sequence length.
-                0.0 means no penalty. Set to values < 0.0 in order to encourage the model to generate longer
-                sequences, to a value > 0.0 in order to encourage the model to produce shorter sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to `model.config.no_repeat_ngram_size` or 0 if the config does not set any value):
                 If set to int > 0, all ngrams of that size can only occur once.
             encoder_no_repeat_ngram_size (`int`, *optional*, defaults to `model.config.encoder_no_repeat_ngram_size` or 0 if the config does not set any value):
diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py
index 14298d6a1cc029..de96c768a20c03 100644
--- a/src/transformers/models/fsmt/configuration_fsmt.py
+++ b/src/transformers/models/fsmt/configuration_fsmt.py
@@ -107,7 +107,10 @@ class FSMTConfig(PretrainedConfig):
             Number of beams for beam search that will be used by default in the `generate` method of the model. 1
             means no beam search.
         length_penalty (`float`, *optional*, defaults to 1)
-            Exponential penalty to the length that will be used by default in the `generate` method of the model.
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
         early_stopping (`bool`, *optional*, defaults to `False`)
            Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
            when at least `num_beams` sentences are finished per batch or not.
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 41af393c671032..45b606905362f8 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -1463,10 +1463,10 @@ def generate(
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
-
-                Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
-                order to encourage the model to produce longer sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
             encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index 26482026baa8f8..a31b2d45217e8d 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -1054,10 +1054,10 @@ def generate(
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
-
-                Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
-                order to encourage the model to produce longer sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
             bad_words_ids(`List[int]`, *optional*):
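
For anyone sanity-checking the new wording, here is a minimal, self-contained sketch of the scoring rule the docstring describes: a finished beam hypothesis is ranked by its summed token log-probabilities divided by `length ** length_penalty`. The `beam_score` helper and the numbers below are illustrative only, assumed for this example; they are not part of the patch or the `transformers` API.

```python
# Minimal sketch (illustrative, not the library implementation) of how
# length_penalty enters beam-based generation scoring.

def beam_score(sum_logprobs: float, length: int, length_penalty: float) -> float:
    """Length-normalized score of a finished beam hypothesis."""
    return sum_logprobs / (length ** length_penalty)

# Log-likelihoods are negative, so dividing by a larger length ** length_penalty
# pushes the score toward zero, i.e. makes it better. Hence length_penalty > 0.0
# favors longer sequences:
short = beam_score(sum_logprobs=-4.0, length=5, length_penalty=1.0)    # -0.8
longer = beam_score(sum_logprobs=-7.0, length=10, length_penalty=1.0)  # -0.7
assert longer > short  # the longer hypothesis wins

# With length_penalty < 0.0 the divisor shrinks as length grows, amplifying the
# negative score, so shorter sequences win instead:
short_neg = beam_score(-4.0, 5, length_penalty=-1.0)    # -20.0
longer_neg = beam_score(-7.0, 10, length_penalty=-1.0)  # -70.0
assert short_neg > longer_neg
```

This also shows why the old "1.0 means no penalty" phrasing was misleading: at `length_penalty=1.0` the score is still divided by the full sequence length, so 1.0 is simply the default normalization, not the absence of one.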