
Commit 378142a

Rename add_start_docstrings_to_callable (#8120)
1 parent 6241c87 commit 378142a


55 files changed: +327 −292 lines
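The commit is a pure rename with no behavioral change: every file swaps the old decorator name for the new one, both at the import and at each decorated forward method. A minimal sketch of the call-site pattern, assuming a transformers version that already contains the rename; the model class and docstring constant below are illustrative placeholders, not taken from any single file in the diff:

from transformers.file_utils import add_start_docstrings_to_model_forward  # previously: add_start_docstrings_to_callable

INPUTS_DOCSTRING = "    input_ids: indices of input tokens."  # placeholder docstring fragment

class ToyModel:
    # previously: @add_start_docstrings_to_callable(INPUTS_DOCSTRING)
    @add_start_docstrings_to_model_forward(INPUTS_DOCSTRING)
    def forward(self, input_ids=None):
        return input_ids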

examples/bert-loses-patience/pabee/modeling_pabee_albert.py

Lines changed: 3 additions & 3 deletions
@@ -20,7 +20,7 @@
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_albert import (
     ALBERT_INPUTS_DOCSTRING,
     ALBERT_START_DOCSTRING,
@@ -87,7 +87,7 @@ def log_stats(self):
         message = f"*** Patience = {self.patience} Avg. Inference Layers = {avg_inf_layers:.2f} Speed Up = {1 - avg_inf_layers / self.config.num_hidden_layers:.2f} ***"
         print(message)
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -230,7 +230,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,

examples/bert-loses-patience/pabee/modeling_pabee_bert.py

Lines changed: 3 additions & 3 deletions
@@ -22,7 +22,7 @@
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_bert import (
     BERT_INPUTS_DOCSTRING,
     BERT_START_DOCSTRING,
@@ -92,7 +92,7 @@ def log_stats(self):
         message = f"*** Patience = {self.patience} Avg. Inference Layers = {avg_inf_layers:.2f} Speed Up = {1 - avg_inf_layers / self.config.num_hidden_layers:.2f} ***"
         print(message)
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -254,7 +254,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,

examples/deebert/src/modeling_highway_bert.py

Lines changed: 3 additions & 3 deletions
@@ -2,7 +2,7 @@
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_bert import (
     BERT_INPUTS_DOCSTRING,
     BERT_START_DOCSTRING,
@@ -134,7 +134,7 @@ def _prune_heads(self, heads_to_prune):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -288,7 +288,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,

examples/deebert/src/modeling_highway_roberta.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from transformers.configuration_roberta import RobertaConfig
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_roberta import ROBERTA_INPUTS_DOCSTRING, ROBERTA_START_DOCSTRING, RobertaEmbeddings
 
 from .modeling_highway_bert import BertPreTrainedModel, DeeBertModel, HighwayException, entropy
@@ -45,7 +45,7 @@ def __init__(self, config):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
         self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
 
-    @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,

examples/movement-pruning/emmental/modeling_bert_masked.py

Lines changed: 6 additions & 6 deletions
@@ -28,7 +28,7 @@
 
 from emmental import MaskedBertConfig
 from emmental.modules import MaskedLinear
-from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from transformers.modeling_bert import ACT2FN, BertLayerNorm, load_tf_weights_in_bert
 from transformers.modeling_utils import PreTrainedModel, prune_linear_layer
 
@@ -498,7 +498,7 @@ def _prune_heads(self, heads_to_prune):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -671,7 +671,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -756,7 +756,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -846,7 +846,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,
@@ -932,7 +932,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(MASKED_BERT_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(MASKED_BERT_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids=None,

src/transformers/file_utils.py

Lines changed: 1 addition & 1 deletion
@@ -425,7 +425,7 @@ def docstring_decorator(fn):
     return docstring_decorator
 
 
-def add_start_docstrings_to_callable(*docstr):
+def add_start_docstrings_to_model_forward(*docstr):
     def docstring_decorator(fn):
         class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0])
         intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name)
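The hunk above shows only the first lines of the renamed decorator; the rest of its body is unchanged by this commit and is not part of the diff. A hedged sketch of what it does, assuming the body follows the same prepend-to-__doc__ pattern as add_start_docstrings (the exact wording and formatting of the real intro/note text are not reproduced here):

def add_start_docstrings_to_model_forward(*docstr):
    # Visible in the diff: build an intro line naming the model class.
    def docstring_decorator(fn):
        class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0])
        intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name)
        # Assumed completion: prepend the intro and the supplied docstring
        # fragments to the wrapped forward method's own docstring.
        fn.__doc__ = intro + "".join(docstr) + (fn.__doc__ or "")
        return fn

    return docstring_decorator

The effect is that the model-specific inputs documentation appears ahead of whatever docstring the forward method already defines.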

src/transformers/modeling_albert.py

Lines changed: 8 additions & 8 deletions
@@ -30,7 +30,7 @@
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import (
@@ -631,7 +631,7 @@ def _prune_heads(self, heads_to_prune):
             inner_group_idx = int(layer - group_idx * self.config.inner_group_num)
             self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads)
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -727,7 +727,7 @@ def get_output_embeddings(self):
     def get_input_embeddings(self):
         return self.albert.embeddings.word_embeddings
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -879,7 +879,7 @@ def get_output_embeddings(self):
     def get_input_embeddings(self):
         return self.albert.embeddings.word_embeddings
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -967,7 +967,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -1055,7 +1055,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -1143,7 +1143,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",
@@ -1242,7 +1242,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="albert-base-v2",

src/transformers/modeling_bart.py

Lines changed: 5 additions & 5 deletions
@@ -30,7 +30,7 @@
     add_code_sample_docstrings,
     add_end_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import (
@@ -846,7 +846,7 @@ def __init__(self, config: BartConfig):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="facebook/bart-large",
@@ -981,7 +981,7 @@ def _resize_final_logits_bias(self, new_num_tokens: int, old_num_tokens: int) ->
         new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
         self.register_buffer("final_logits_bias", new_bias)
 
-    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
     @add_end_docstrings(BART_GENERATION_EXAMPLE)
     def forward(
@@ -1147,7 +1147,7 @@ def __init__(self, config: BartConfig, **kwargs):
         self.model._init_weights(self.classification_head.dense)
         self.model._init_weights(self.classification_head.out_proj)
 
-    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="facebook/bart-large",
@@ -1234,7 +1234,7 @@ def __init__(self, config):
 
         self.model._init_weights(self.qa_outputs)
 
-    @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="facebook/bart-large",

src/transformers/modeling_bert.py

Lines changed: 10 additions & 10 deletions
@@ -33,7 +33,7 @@
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_outputs import (
@@ -748,7 +748,7 @@ class PreTrainedModel
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -870,7 +870,7 @@ def __init__(self, config):
     def get_output_embeddings(self):
         return self.cls.predictions.decoder
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -983,7 +983,7 @@ def __init__(self, config):
     def get_output_embeddings(self):
         return self.cls.predictions.decoder
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1103,7 +1103,7 @@ def __init__(self, config):
     def get_output_embeddings(self):
         return self.cls.predictions.decoder
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -1206,7 +1206,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
@@ -1300,7 +1300,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -1384,7 +1384,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -1479,7 +1479,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",
@@ -1569,7 +1569,7 @@ def __init__(self, config):
 
         self.init_weights()
 
-    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="bert-base-uncased",

src/transformers/modeling_bert_generation.py

Lines changed: 3 additions & 3 deletions
@@ -24,7 +24,7 @@
 from .file_utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
-    add_start_docstrings_to_callable,
+    add_start_docstrings_to_model_forward,
     replace_return_docstrings,
 )
 from .modeling_bert import BertEncoder
@@ -293,7 +293,7 @@ class PreTrainedModel
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    @add_start_docstrings_to_callable(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         tokenizer_class=_TOKENIZER_FOR_DOC,
         checkpoint="google/bert_for_seq_generation_L-24_bbc_encoder",
@@ -421,7 +421,7 @@ def __init__(self, config):
     def get_output_embeddings(self):
        return self.lm_head.decoder
 
-    @add_start_docstrings_to_callable(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=CausalLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
