From 4157e3cd7e2bb5a7be6dc065a3e20c49cc1300ab Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Tue, 13 Sep 2022 18:16:36 +0100
Subject: [PATCH] new length penalty docstring (#19006)

---
 src/transformers/configuration_utils.py      |  5 ++++-
 src/transformers/generation_beam_search.py   | 14 +++++++------
 src/transformers/generation_tf_utils.py      | 21 +++++++++++--------
 src/transformers/generation_utils.py         |  7 ++++---
 .../models/fsmt/configuration_fsmt.py        |  5 ++++-
 src/transformers/models/rag/modeling_rag.py  |  8 +++----
 .../models/rag/modeling_tf_rag.py            |  8 +++----
 7 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index db8147b4dee349..3fdc0f265f6331 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -148,7 +148,10 @@ class PretrainedConfig(PushToHubMixin):
             Parameter for repetition penalty that will be used by default in the `generate` method of the model. 1.0
             means no penalty.
         length_penalty (`float`, *optional*, defaults to 1):
-            Exponential penalty to the length that will be used by default in the `generate` method of the model.
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
         no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the
             `generate` method of the model for `no_repeat_ngram_size`. If set to int > 0, all ngrams of that size can
             only occur once.
diff --git a/src/transformers/generation_beam_search.py b/src/transformers/generation_beam_search.py
index 7c50c0d7acdccc..902160b228fdc2 100644
--- a/src/transformers/generation_beam_search.py
+++ b/src/transformers/generation_beam_search.py
@@ -138,9 +138,10 @@ class BeamSearchScorer(BeamScorer):
             Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be
             allocated.
         length_penalty (`float`, *optional*, defaults to 1.0):
-            Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
-            model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
-            sequences.
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
         do_early_stopping (`bool`, *optional*, defaults to `False`):
             Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
         num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
@@ -405,9 +406,10 @@ class ConstrainedBeamSearchScorer(BeamScorer):
             Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of `BeamSearchScorer` will be
             allocated.
         length_penalty (`float`, *optional*, defaults to 1.0):
-            Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
-            model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
-            sequences.
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
         do_early_stopping (`bool`, *optional*, defaults to `False`):
             Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
         num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index 5652b0e180de85..30319dcce389ed 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -455,10 +455,10 @@ def generate(
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
-
-                Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
-                order to encourage the model to produce longer sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
             bad_words_ids(`List[int]`, *optional*):
@@ -1419,10 +1419,10 @@ def _generate(
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
-
-                Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
-                order to encourage the model to produce longer sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
             bad_words_ids(`List[int]`, *optional*):
@@ -2657,7 +2657,10 @@ def beam_search(
            eos_token_id (`int`, *optional*):
                The id of the *end-of-sequence* token.
            length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
            early_stopping (`bool`, *optional*, defaults to `False`):
                Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
            logits_processor (`[TFLogitsProcessorList]`, *optional*):
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 7544473d7838ec..84f1a6f0392a38 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -1005,9 +1005,10 @@ def generate(
             eos_token_id (`int`, *optional*, defaults to `model.config.eos_token_id`):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to `model.config.length_penalty` or 1.0 if the config does not set any value):
-                Exponential penalty to the length. 1.0 means that the beam score is penalized by the sequence length.
-                0.0 means no penalty. Set to values < 0.0 in order to encourage the model to generate longer
-                sequences, to a value > 0.0 in order to encourage the model to produce shorter sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to `model.config.no_repeat_ngram_size` or 0 if the config does not set any value):
                 If set to int > 0, all ngrams of that size can only occur once.
             encoder_no_repeat_ngram_size (`int`, *optional*, defaults to `model.config.encoder_no_repeat_ngram_size` or 0 if the config does not set any value):
diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py
index 14298d6a1cc029..de96c768a20c03 100644
--- a/src/transformers/models/fsmt/configuration_fsmt.py
+++ b/src/transformers/models/fsmt/configuration_fsmt.py
@@ -107,7 +107,10 @@ class FSMTConfig(PretrainedConfig):
             Number of beams for beam search that will be used by default in the `generate` method of the model. 1
             means no beam search.
         length_penalty (`float`, *optional*, defaults to 1)
-            Exponential penalty to the length that will be used by default in the `generate` method of the model.
+            Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+            the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+            likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+            `length_penalty` < 0.0 encourages shorter sequences.
         early_stopping (`bool`, *optional*, defaults to `False`)
            Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
            when at least `num_beams` sentences are finished per batch or not.
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 41af393c671032..45b606905362f8 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -1463,10 +1463,10 @@ def generate(
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
-
-                Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
-                order to encourage the model to produce longer sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
             encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index 26482026baa8f8..a31b2d45217e8d 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -1054,10 +1054,10 @@ def generate(
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
             length_penalty (`float`, *optional*, defaults to 1.0):
-                Exponential penalty to the length. 1.0 means no penalty.
-
-                Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
-                order to encourage the model to produce longer sequences.
+                Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent
+                to the sequence length, which in turn is used to divide the score of the sequence. Since the score is
+                the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences,
+                while `length_penalty` < 0.0 encourages shorter sequences.
             no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
             bad_words_ids(`List[int]`, *optional*):
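
For anyone sanity-checking the new wording, here is a minimal, self-contained sketch of the scoring rule the docstring describes: a finished beam hypothesis is ranked by its summed token log-probabilities divided by `length ** length_penalty`. The `beam_score` helper and the numbers below are illustrative only, assumed for this example; they are not part of the patch or the `transformers` API.

```python
# Minimal sketch (illustrative, not the library implementation) of how
# length_penalty enters beam-based generation scoring.

def beam_score(sum_logprobs: float, length: int, length_penalty: float) -> float:
    """Length-normalized score of a finished beam hypothesis."""
    return sum_logprobs / (length ** length_penalty)

# Log-likelihoods are negative, so dividing by a larger length ** length_penalty
# pushes the score toward zero, i.e. makes it better. Hence length_penalty > 0.0
# favors longer sequences:
short = beam_score(sum_logprobs=-4.0, length=5, length_penalty=1.0)    # -0.8
longer = beam_score(sum_logprobs=-7.0, length=10, length_penalty=1.0)  # -0.7
assert longer > short  # the longer hypothesis wins

# With length_penalty < 0.0 the divisor shrinks as length grows, amplifying the
# negative score, so shorter sequences win instead:
short_neg = beam_score(-4.0, 5, length_penalty=-1.0)    # -20.0
longer_neg = beam_score(-7.0, 10, length_penalty=-1.0)  # -70.0
assert short_neg > longer_neg
```

This also shows why the old "1.0 means no penalty" phrasing was misleading: at `length_penalty=1.0` the score is still divided by the full sequence length, so 1.0 is simply the default normalization, not the absence of one.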