
[doc] remove the implied defaults to :obj:`None`, s/True/:obj:`True`/, etc. (huggingface#6956)

* remove the implied defaults to :obj:`None`

* fix bug in the original

* replace to :obj:`True`, :obj:`False`
stas00 authored and fabiocapsouza committed Nov 15, 2020
1 parent 7f17334 commit f397b75
Showing 71 changed files with 578 additions and 578 deletions.
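For reference, the convention this commit enforces, shown on a hypothetical docstring (not taken from the diff): `optional` arguments drop the implied "defaults to :obj:`None`", and literal booleans are wrapped in the :obj: role, for example::

    def forward(self, input_ids, attention_mask=None, return_dict=False):
        r"""
        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices.
        return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to return a dict instead of a plain tuple.
        """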
@@ -242,7 +242,7 @@ def forward(
labels=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
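The labels argument documented in the hunk above is used the same way across these classification models. A minimal usage sketch, assuming a recent transformers version and the bert-base-uncased checkpoint (neither is part of this diff)::

    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

    inputs = tokenizer("This film was great!", return_tensors="pt")
    labels = torch.tensor([1])  # indices in [0, ..., config.num_labels - 1]

    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss)    # cross-entropy loss; MSE if config.num_labels == 1
    print(outputs.logits)  # shape (batch_size, config.num_labels)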
2 changes: 1 addition & 1 deletion examples/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -266,7 +266,7 @@ def forward(
labels=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
2 changes: 1 addition & 1 deletion examples/deebert/src/modeling_highway_bert.py
@@ -302,7 +302,7 @@ def forward(
train_highway=False,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
2 changes: 1 addition & 1 deletion examples/deebert/src/modeling_highway_roberta.py
@@ -59,7 +59,7 @@ def forward(
train_highway=False,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
24 changes: 12 additions & 12 deletions examples/movement-pruning/emmental/modeling_bert_masked.py
@@ -426,35 +426,35 @@ def _init_weights(self, module):
:func:`transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
- attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
`What are attention masks? <../glossary.html#attention-mask>`__
- token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
`What are token type IDs? <../glossary.html#token-type-ids>`_
- position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+ position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
`What are position IDs? <../glossary.html#position-ids>`_
- head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
:obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
- inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask
is used in the cross-attention if the model is configured as a decoder.
Mask values selected in ``[0, 1]``:
@@ -684,7 +684,7 @@ def forward(
threshold=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
@@ -769,7 +769,7 @@ def forward(
threshold=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
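A sketch of the multiple-choice labels described above, using the stock BertForMultipleChoice head rather than the masked model in this file (checkpoint and data are illustrative)::

    import torch
    from transformers import AutoTokenizer, BertForMultipleChoice

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMultipleChoice.from_pretrained("bert-base-uncased")

    prompt = "The glass fell off the table,"
    choices = ["so it broke.", "so it started to rain."]

    # encode one (prompt, choice) pair per choice, then add a batch dimension:
    # every tensor ends up with shape (batch_size=1, num_choices=2, sequence_length)
    encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
    inputs = {name: tensor.unsqueeze(0) for name, tensor in encoding.items()}
    labels = torch.tensor([0])  # index of the correct choice for each batch entry

    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss, outputs.logits.shape)  # logits: (1, num_choices)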
@@ -859,7 +859,7 @@ def forward(
threshold=None,
):
r"""
- labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
threshold (:obj:`float`):
@@ -946,11 +946,11 @@ def forward(
threshold=None,
):
r"""
- start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
- end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+ end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
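The start_positions/end_positions pair above feeds the span-extraction loss. A hedged sketch with BertForQuestionAnswering (checkpoint and span indices are illustrative, not from the diff)::

    import torch
    from transformers import AutoTokenizer, BertForQuestionAnswering

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

    question = "Who created BERT?"
    context = "BERT was created by researchers at Google."
    inputs = tokenizer(question, context, return_tensors="pt")

    # token indices of the gold answer span; values outside the sequence are ignored
    start_positions = torch.tensor([8])
    end_positions = torch.tensor([10])

    outputs = model(**inputs, start_positions=start_positions,
                    end_positions=end_positions, return_dict=True)
    print(outputs.loss, outputs.start_logits.shape, outputs.end_logits.shape)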
14 changes: 7 additions & 7 deletions src/transformers/configuration_bart.py
@@ -65,17 +65,17 @@
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
init_std (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- add_bias_logits (:obj:`bool`, optional, defaults to False):
+ add_bias_logits (:obj:`bool`, optional, defaults to :obj:`False`):
True for marian only.
- normalize_before (:obj:`bool`, optional, defaults to False):
+ normalize_before (:obj:`bool`, optional, defaults to :obj:`False`):
Call layernorm before attention ops. True for pegasus, mbart. False for bart. FIXME: marian?
- normalize_embedding (:obj:`bool`, optional, defaults to True):
+ normalize_embedding (:obj:`bool`, optional, defaults to :obj:`True`):
Call layernorm after embeddings. Only True for Bart.
- static_position_embeddings (:obj:`bool`, optional, defaults to False):
+ static_position_embeddings (:obj:`bool`, optional, defaults to :obj:`False`):
Don't learn positional embeddings, use sinusoidal. True for marian, pegasus.
- add_final_layer_norm (:obj:`bool`, optional, defaults to False):
+ add_final_layer_norm (:obj:`bool`, optional, defaults to :obj:`False`):
Why not add another layernorm?
- scale_embedding (:obj:`bool`, optional, defaults to False):
+ scale_embedding (:obj:`bool`, optional, defaults to :obj:`False`):
Scale embeddings by diving by sqrt(d_model).
eos_token_id (:obj:`int`, optional, defaults to 2)
End of stream token id.
@@ -91,7 +91,7 @@
How many extra learned positional embeddings to use. Should be pad_token_id+1 for bart.
num_labels: (:obj:`int`, optional, defaults to 3):
for SequenceClassification
- is_encoder_decoder (:obj:`bool`, optional, defaults to True):
+ is_encoder_decoder (:obj:`bool`, optional, defaults to :obj:`True`):
Whether this is an encoder/decoder model
force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only true for `bart-large-cnn`.
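Taken together, the boolean flags above describe a model family; a hedged sketch of building a config from them, assuming the BartConfig API as it existed around this commit (several of these arguments were later removed)::

    from transformers import BartConfig, BartForConditionalGeneration

    # a Pegasus/mBART-style layout expressed through the flags documented above
    config = BartConfig(
        normalize_before=True,            # layernorm before the attention ops
        static_position_embeddings=True,  # sinusoidal rather than learned positions
        add_final_layer_norm=True,
        scale_embedding=True,
        is_encoder_decoder=True,
    )
    model = BartForConditionalGeneration(config)
    print(config.normalize_before, config.scale_embedding)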
2 changes: 1 addition & 1 deletion src/transformers/configuration_bert.py
@@ -88,7 +88,7 @@ class BertConfig(PretrainedConfig):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
- gradient_checkpointing (:obj:`bool`, optional, defaults to False):
+ gradient_checkpointing (:obj:`bool`, optional, defaults to :obj:`False`):
If True, use gradient checkpointing to save memory at the expense of slower backward pass.
Example::
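The file's own Example is elided above; separately, a hedged sketch of the gradient_checkpointing flag, valid for the transformers versions current at this commit (later releases expose model.gradient_checkpointing_enable() instead)::

    from transformers import BertConfig, BertModel

    # trade extra forward computation for lower memory use in the backward pass
    config = BertConfig(gradient_checkpointing=True)
    model = BertModel(config)
    print(config.gradient_checkpointing)  # True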
2 changes: 1 addition & 1 deletion src/transformers/configuration_electra.py
@@ -88,7 +88,7 @@ class ElectraConfig(PretrainedConfig):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
Add a projection after the vector extraction
- summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+ summary_activation (:obj:`string` or :obj:`None`, optional):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.ElectraForMultipleChoice`.
'gelu' => add a gelu activation to the output, Other => no activation.
2 changes: 1 addition & 1 deletion src/transformers/configuration_flaubert.py
@@ -117,7 +117,7 @@ class FlaubertConfig(XLMConfig):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
Add a projection after the vector extraction
- summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+ summary_activation (:obj:`string` or :obj:`None`, optional):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.XLMForSequenceClassification`.
'tanh' => add a tanh activation to the output, Other => no activation.
2 changes: 1 addition & 1 deletion src/transformers/configuration_gpt2.py
@@ -85,7 +85,7 @@ class GPT2Config(PretrainedConfig):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
Add a projection after the vector extraction
- summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+ summary_activation (:obj:`string` or :obj:`None`, optional):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.GPT2DoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
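summary_activation behaves the same way in the ELECTRA, Flaubert, GPT-2 and OpenAI GPT configs changed in this commit; a hedged sketch for GPT-2 (config only, no checkpoint)::

    from transformers import GPT2Config, GPT2DoubleHeadsModel

    # 'tanh' adds a tanh after the sequence-summary projection;
    # leaving it at None (the implied default) applies no activation
    config = GPT2Config(summary_use_proj=True, summary_activation="tanh")
    model = GPT2DoubleHeadsModel(config)
    print(config.summary_activation)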
18 changes: 9 additions & 9 deletions src/transformers/configuration_lxmert.py
@@ -85,25 +85,25 @@ class LxmertConfig(PretrainedConfig):
num_attr_labels (:obj:`int`, optional, defaults to 400):
This represents the total number of semantically unique attributes that lxmert will be able to classify a pooled-object feature
as possessing.
- task_matched (:obj:`bool`, optional, defaults to True):
+ task_matched (:obj:`bool`, optional, defaults to :obj:`True`):
This task is used for sentence-image matching. If the sentence correctly describes the image the label will be 1.
If the sentence does not correctly describe the image, the label will be 0.
- task_mask_lm (:obj:`bool`, optional, defaults to True):
+ task_mask_lm (:obj:`bool`, optional, defaults to :obj:`True`):
This task is the defacto masked langauge modeling used in pretraining models such as BERT.
- task_obj_predict (:obj:`bool`, optional, defaults to True):
+ task_obj_predict (:obj:`bool`, optional, defaults to :obj:`True`):
This task is set to true if the user would like to perform one of the following loss objectives:
object predicition, atrribute predicition, feature regression
- task_qa (:obj:`bool`, optional, defaults to True):
+ task_qa (:obj:`bool`, optional, defaults to :obj:`True`):
This task specifies whether or not Lxmert will calculate the question-asnwering loss objective
- visual_obj_loss (:obj:`bool`, optional, defaults to True):
+ visual_obj_loss (:obj:`bool`, optional, defaults to :obj:`True`):
This task specifies whether or not Lxmert will calculate the object-prediction loss objective
- visual_attr_loss (:obj:`bool`, optional, defaults to True):
+ visual_attr_loss (:obj:`bool`, optional, defaults to :obj:`True`):
This task specifies whether or not Lxmert will calculate the attribute-prediction loss objective
- visual_feat_loss (:obj:`bool`, optional, defaults to True):
+ visual_feat_loss (:obj:`bool`, optional, defaults to :obj:`True`):
This task specifies whether or not Lxmert will calculate the feature-regression loss objective
- output_attentions (:obj:`bool`, optional, defaults to False):
+ output_attentions (:obj:`bool`, optional, defaults to :obj:`False`):
if True, the vision, langauge, and cross-modality layers will be returned
- output_hidden_states (:obj:`bool`, optional, defaults to False):
+ output_hidden_states (:obj:`bool`, optional, defaults to :obj:`False`):
if True, final cross-modality hidden states for language and vision features will be returned
"""
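A hedged sketch of combining the LXMERT task/loss flags documented above (argument names taken from the docstring; the chosen values are illustrative)::

    from transformers import LxmertConfig, LxmertForPreTraining

    # keep masked LM and sentence-image matching, drop the QA head and visual losses
    config = LxmertConfig(
        task_mask_lm=True,
        task_matched=True,
        task_obj_predict=False,
        task_qa=False,
        visual_obj_loss=False,
        visual_attr_loss=False,
        visual_feat_loss=False,
    )
    model = LxmertForPreTraining(config)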
8 changes: 4 additions & 4 deletions src/transformers/configuration_mobilebert.py
@@ -67,15 +67,15 @@ class MobileBertConfig(PretrainedConfig):
The ID of the token in the word embedding to use as padding.
embedding_size (:obj:`int`, optional, defaults to 128):
The dimension of the word embedding vectors.
- trigram_input (:obj:`bool`, optional, defaults to True):
+ trigram_input (:obj:`bool`, optional, defaults to :obj:`True`):
Use a convolution of trigram as input.
- use_bottleneck (:obj:`bool`, optional, defaults to True):
+ use_bottleneck (:obj:`bool`, optional, defaults to :obj:`True`):
Whether to use bottleneck in BERT.
intra_bottleneck_size (:obj:`int`, optional, defaults to 128):
Size of bottleneck layer output.
- use_bottleneck_attention (:obj:`bool`, optional, defaults to False):
+ use_bottleneck_attention (:obj:`bool`, optional, defaults to :obj:`False`):
Whether to use attention inputs from the bottleneck transformation.
- key_query_shared_bottleneck (:obj:`bool`, optional, defaults to True):
+ key_query_shared_bottleneck (:obj:`bool`, optional, defaults to :obj:`True`):
Whether to use the same linear transformation for query&key in the bottleneck.
num_feedforward_networks (:obj:`int`, optional, defaults to 4):
Number of FFNs in a block.
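A hedged sketch of the bottleneck-related flags above, mirroring the documented defaults (the explicit values here are illustrative, not part of the diff)::

    from transformers import MobileBertConfig, MobileBertModel

    config = MobileBertConfig(
        trigram_input=True,                # convolution of trigrams as input
        use_bottleneck=True,
        intra_bottleneck_size=128,
        use_bottleneck_attention=False,    # attention does not read the bottleneck output
        key_query_shared_bottleneck=True,  # one linear map for both key and query
    )
    model = MobileBertModel(config)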
2 changes: 1 addition & 1 deletion src/transformers/configuration_openai.py
@@ -81,7 +81,7 @@ class OpenAIGPTConfig(PretrainedConfig):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
Add a projection after the vector extraction
- summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+ summary_activation (:obj:`string` or :obj:`None`, optional):
Argument used when doing sequence summary. Used in for the multiple choice head in
:class:`~transformers.OpenAIGPTDoubleHeadsModel`.
'tanh' => add a tanh activation to the output, Other => no activation.
4 changes: 2 additions & 2 deletions src/transformers/configuration_reformer.py
@@ -45,7 +45,7 @@ class ReformerConfig(PretrainedConfig):
LSHSelfAttention layer ("lsh") and a LocalSelfAttention layer ("local").
For more information on LSHSelfAttention layer, see `LSH Self Attention <reformer.html#lsh-self-attention>`__ .
For more information on LocalSelfAttention layer, see `Local Self Attention <reformer.html#local-sensitive-hashing-self-attention>`__ .
- axial_pos_embds (:obj:`bool`, optional, defaults to True):
+ axial_pos_embds (:obj:`bool`, optional, defaults to :obj:`True`):
If `True` use axial position embeddings. For more information on how axial position embeddings work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__
axial_norm_std (:obj:`float`, optional, defaluts to 1.0):
The standard deviation of the normal_initializer for initializing the weight matrices of the axial positional encodings.
@@ -77,7 +77,7 @@ class ReformerConfig(PretrainedConfig):
Dimensionality of the output hidden states of the residual attention blocks.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- is_decoder (:obj:`bool`, optional, defaults to False):
+ is_decoder (:obj:`bool`, optional, defaults to :obj:`False`):
If `is_decoder` is True, a causal mask is used in addition to `attention_mask`.
When using the Reformer for causal language modeling, `is_decoder` is set to `True`.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
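A hedged sketch of the two Reformer flags shown in this hunk, assuming the ReformerModelWithLMHead class available at the time of the commit::

    from transformers import ReformerConfig, ReformerModelWithLMHead

    # causal LM: is_decoder=True adds a causal mask on top of attention_mask;
    # axial_pos_embds=True keeps the axial position embeddings (the default)
    config = ReformerConfig(is_decoder=True, axial_pos_embds=True)
    model = ReformerModelWithLMHead(config)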