Add BartModel #2745
Changes from 1 commit
@@ -27,7 +27,7 @@
 from .configuration_bart import BartConfig
 from .file_utils import add_start_docstrings
 from .modeling_utils import PreTrainedModel
+from .utils_encoder_decoder import prepare_encoder_decoder_model_kwargs

 logger = logging.getLogger(__name__)
@@ -94,15 +94,12 @@ def get_input_embeddings(self):
     def set_input_embeddings(self, value):
         self.shared = value

-    def forward(self, input_ids: torch.LongTensor = None, return_for_head=False, **kwargs):
-        input_ids = input_ids if input_ids is not None else kwargs["encoder_input_ids"]  # TODO(SS): decide on API
-        if input_ids.dim() == 1:
-            input_ids = input_ids.unsqueeze(0)
-        if input_ids.size(-1) > min(self.max_positions()):
-            raise ValueError(
-                "input_ids exceeds maximum length: {} > {}".format(input_ids.size(-1), self.max_positions())
-            )
-        encoder_out = self.encoder(input_ids)
+    # def forward(self, input_ids: torch.LongTensor = None, return_for_head=False, **kwargs):
+    def forward(self, return_for_head=False, **kwargs):
+        kwargs_encoder, kwargs_decoder = prepare_encoder_decoder_model_kwargs(**kwargs)
+        # TODO(SS): only call encoder if we need to
+        encoder_out = self.encoder(**kwargs_encoder)
+        input_ids = kwargs_encoder.pop('input_ids')
         prev_output_tokens = self.shift_tokens_left(input_ids, self.config.pad_token_id)
         dec_features, dec_hidden, dec_attn = self.decoder(prev_output_tokens, encoder_out=encoder_out,)
Review comment: There should be more documentation here. So we are feeding the same input to both the encoder and the decoder, with the decoder input shifted by one token to the left. I feel like this logic is very specific to the pretraining of BART, and I'm wondering whether we should have it incorporated in the forward loop or rather as an external pre-processing step during the training loop (like we do for other pretraining logic, e.g. preparing inputs for MLM models). What do you think?

Review comment: Cf. my comment on the forward method API: I would maybe recommend moving this logic from the inner model to the specific derived model.
         if return_for_head:  # split encoder and decoder outputs nicely
sshleifer marked this conversation as resolved.
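For readers skimming the diff: the body of `shift_tokens_left` is not shown in this hunk, so the following is only a minimal sketch of what a one-token left shift with pad filling could look like, assuming `input_ids` has shape `(batch, seq_len)`; the PR's actual helper may differ.

```python
import torch

def shift_tokens_left(input_ids: torch.LongTensor, pad_token_id: int) -> torch.LongTensor:
    """Hypothetical sketch: position i of the output holds token i + 1 of the input,
    and the freed final position is filled with the pad token."""
    shifted = input_ids.new_full(input_ids.shape, pad_token_id)
    shifted[:, :-1] = input_ids[:, 1:]
    return shifted

# e.g. [[BOS, A, B, EOS]] -> [[A, B, EOS, PAD]]
```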
@@ -176,20 +173,16 @@ class BartForSequenceClassification(PretrainedBartModel):
     def __init__(self, config: BartConfig, **kwargs):
         super().__init__(config, **kwargs)
         self.model = BartModel(config)
-        self.classification_head = BARTClassificationHead(
+        self.classification_head = BartClassificationHead(
             config.d_model, config.d_model, config.num_labels, config.classif_dropout,
         )
         self.loss_fn = nn.CrossEntropyLoss()

-    def forward(self, input_ids, *args, **kwargs):
-
-        if input_ids.ndim == 1:
-            input_ids = input_ids.unsqueeze(0)
-        kwargs["return_for_head"] = True
-        decoder_outputs, encoder_outputs = self.model(input_ids, *args, **kwargs)
+    def forward(self, **kwargs):
+        labels = kwargs.pop("labels", None)
+        decoder_outputs, encoder_outputs = self.model(return_for_head=True, **kwargs)
         x = decoder_outputs[0]  # last hidden state
-
+        input_ids = _get_input_ids_from_kwargs(**kwargs)
         eos_mask = input_ids.eq(self.eos_token)
         if len(torch.unique(eos_mask.sum(1))) > 1:
             raise ValueError("All examples must have the same number of <eos> tokens.")
Review comment: Up to now we have avoided mixing model and tokenization logic, in particular to leave the user more freedom. We should discuss whether we want to change this philosophy here. cc @julien-c @LysandreJik

Review comment: I'm still a firm believer that model logic and tokenization logic should be separate. We used to have a similar warning/error in RoBERTa but removed it because it […]

Review comment: I think the best way is to use […]

Review comment: It's easier in Roberta to dissociate because we do […]

Review comment: Yes, sounds reasonable.
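The part of `forward` that actually consumes `eos_mask` is not visible in this hunk. As a rough sketch of how the `<eos>` hidden states could be pooled for classification, assuming `x` has shape `(batch, seq_len, d_model)` and every example has the same number of `<eos>` tokens (exactly what the `ValueError` above enforces); the PR's actual code may differ.

```python
import torch

def pool_eos_hidden_states(x: torch.Tensor, eos_mask: torch.Tensor) -> torch.Tensor:
    """Keep the decoder hidden state at each example's final <eos> position.

    x:        (batch, seq_len, d_model) decoder hidden states
    eos_mask: (batch, seq_len) boolean mask of <eos> positions
    """
    # Flatten all <eos> positions, regroup per example, keep the last one per row.
    per_example = x[eos_mask, :].view(x.size(0), -1, x.size(-1))
    return per_example[:, -1, :]  # (batch, d_model), fed to the classification head
```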
@@ -204,6 +197,9 @@ def forward(self, input_ids, *args, **kwargs):
         return decoder_outputs + encoder_outputs


+def _get_input_ids_from_kwargs(**kwargs):
+    """Try to get input_ids and if that key is not present get encoder_input_ids."""
+    return kwargs.get('input_ids', kwargs.get('encoder_input_ids', None))


 # Encoder and Decoder
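For clarity, the fallback behavior of `_get_input_ids_from_kwargs` with toy values (illustrative only):

```python
_get_input_ids_from_kwargs(input_ids="ids")                                # -> "ids"
_get_input_ids_from_kwargs(encoder_input_ids="enc_ids")                    # -> "enc_ids"
_get_input_ids_from_kwargs(input_ids="ids", encoder_input_ids="enc_ids")   # -> "ids" (input_ids wins)
_get_input_ids_from_kwargs(attention_mask="mask")                          # -> None
```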
@@ -481,33 +477,6 @@ def max_positions(self):
         """Maximum input length supported by the encoder."""
         return min(self.max_source_positions, self.embed_positions.max_positions)

-    # Unused
-    def reorder_encoder_out(self, encoder_out, new_order):
-        """
-        Reorder encoder output according to *new_order*.
-
-        Args:
-            encoder_out: output from the ``forward()`` method
-            new_order (LongTensor): desired order
-
-        Returns:
-            *encoder_out* rearranged according to *new_order*
-        """
-        if encoder_out.encoder_out is not None:
-            encoder_out = encoder_out._replace(encoder_out=encoder_out.encoder_out.index_select(1, new_order))
-        if encoder_out.encoder_padding_mask is not None:
-            encoder_out = encoder_out._replace(
-                encoder_padding_mask=encoder_out.encoder_padding_mask.index_select(0, new_order)
-            )
-        if encoder_out.encoder_embedding is not None:
-            encoder_out = encoder_out._replace(
-                encoder_embedding=encoder_out.encoder_embedding.index_select(0, new_order)
-            )
-        if encoder_out.encoder_states is not None:
-            for idx, state in enumerate(encoder_out.encoder_states):
-                encoder_out.encoder_states[idx] = state.index_select(1, new_order)
-        return encoder_out


 class BartDecoder(nn.Module):
     """
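For context on what the removed `reorder_encoder_out` was for: during beam search the hypotheses are re-sorted at every step, so any cached encoder state has to be permuted with the same `new_order`. A toy illustration of the underlying `index_select` pattern (hypothetical shapes, not the PR's API):

```python
import torch

# Encoder output cached as (seq_len, batch * beam, hidden); dim 1 indexes hypotheses.
encoder_out = torch.randn(7, 6, 16)            # e.g. batch=2, beam=3
new_order = torch.tensor([1, 0, 2, 4, 3, 5])   # new hypothesis ordering after a beam step

reordered = encoder_out.index_select(1, new_order)  # permute along the hypothesis dimension

# Masks stored as (batch * beam, seq_len) would instead use index_select(0, new_order),
# which matches the mix of dims 0 and 1 in the removed code above.
```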
@@ -644,7 +613,7 @@ def buffered_future_mask(self, tensor):
 # Helper Modules


-class BARTClassificationHead(nn.Module):
+class BartClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""

     # This can trivially be shared with RobertaClassificationHead
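The head's definition itself is not part of this hunk. Judging from the constructor call earlier in the diff, `BartClassificationHead(input_dim, inner_dim, num_classes, dropout)`, and from the `RobertaClassificationHead` it is compared to, a plausible sketch is the usual dense → tanh → dropout → projection stack (an assumption, not necessarily the PR's exact code):

```python
import torch
from torch import nn

class BartClassificationHead(nn.Module):
    """Head for sentence-level classification tasks (sketch)."""

    def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float):
        super().__init__()
        self.dense = nn.Linear(input_dim, inner_dim)
        self.dropout = nn.Dropout(p=pooler_dropout)
        self.out_proj = nn.Linear(inner_dim, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, d_model) pooled <eos> representation
        x = self.dropout(x)
        x = torch.tanh(self.dense(x))
        x = self.dropout(x)
        return self.out_proj(x)  # (batch, num_classes)
```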
@@ -920,16 +889,6 @@ def _append_prev_key_padding_mask(
             new_key_padding_mask = prev_key_padding_mask
         return new_key_padding_mask

-    def reorder_incremental_state(self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order):
-        """Reorder buffered internal state (for incremental generation)."""
-        # TODO(SS): Where is this used?
-        input_buffer = self._get_input_buffer(incremental_state)
-        if input_buffer is not None:
-            for k in input_buffer.keys():
-                if input_buffer[k] is not None:
-                    input_buffer[k] = input_buffer[k].index_select(0, new_order)
-            self._set_input_buffer(incremental_state, input_buffer)
-
     def _get_input_buffer(
         self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
     ) -> Dict[str, Optional[Tensor]]:
@@ -0,0 +1,51 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Classes to support Encoder-Decoder architectures """
+
+
+def prepare_encoder_decoder_model_kwargs(**kwargs):
+    """ Prepare the encoder and decoder's keyword arguments.
+
+    Keyword arguments come in 3 flavors:
+    - encoder-specific (prefixed by `encoder_`)
+    - decoder-specific (prefixed by `decoder_`)
+    - those that apply to the model as whole.
+
+    We let the specific kwargs override the common ones in case of
+    conflict.
+    """
+    kwargs_common = {
+        argument: value
+        for argument, value in kwargs.items()
+        if not argument.startswith("encoder_") and not argument.startswith("decoder_")
+    }
+    decoder_kwargs = kwargs_common.copy()
+    encoder_kwargs = kwargs_common.copy()
+    encoder_kwargs.update(
+        {
+            argument[len("encoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("encoder_")
+        }
+    )
+    decoder_kwargs.update(
+        {
+            argument[len("decoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("decoder_")
+        }
+    )
+    decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None)
+    return encoder_kwargs, decoder_kwargs
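A quick illustration of how the splitting behaves, with toy placeholder values (run against the function above):

```python
kwargs = {
    "input_ids": "common_ids",             # shared by encoder and decoder
    "attention_mask": "common_mask",       # shared, but overridden for the encoder below
    "encoder_attention_mask": "enc_mask",  # encoder-specific, wins over the common mask
    "decoder_input_ids": "dec_ids",        # decoder-specific
}
encoder_kwargs, decoder_kwargs = prepare_encoder_decoder_model_kwargs(**kwargs)

assert encoder_kwargs == {"input_ids": "common_ids", "attention_mask": "enc_mask"}
assert decoder_kwargs == {
    "input_ids": "dec_ids",
    "attention_mask": "common_mask",
    "encoder_attention_mask": "enc_mask",
}
```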
Review comment: Won't this name mismatch cause the value saved by save_pretrained() not to be loaded into the config by the from_pretrained() method?

Review comment: I have no clue what problem you are trying to describe. Please file an issue with a pasteable code snippet that has a different output than you expected.

Review comment: OK, filed #7591.