Add PerceiverForSequenceClassification

huggingface · NielsRogge · Dec 8, 2021 · Aug 2, 2021 · Sep 6, 2021 · Sep 6, 2021
commit 2a3c57c3b3f9b6802079cd83673af077a34a4dbb
diff --git a/docs/source/model_doc/perceiver.rst b/docs/source/model_doc/perceiver.rst
@@ -192,6 +192,13 @@ PerceiverForMaskedLM
     :members: forward
 
 
+PerceiverForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PerceiverForSequenceClassification
+    :members: forward
+
+
 PerceiverForImageClassificationLearned
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -1140,6 +1140,7 @@
             "PerceiverForMaskedLM",
             "PerceiverForMultimodalAutoencoding",
             "PerceiverForOpticalFlow",
+            "PerceiverForSequenceClassification",
             "PerceiverLayer",
             "PerceiverModel",
             "PerceiverPreTrainedModel",
@@ -3008,6 +3009,7 @@
             PerceiverForMaskedLM,
             PerceiverForMultimodalAutoencoding,
             PerceiverForOpticalFlow,
+            PerceiverForSequenceClassification,
             PerceiverLayer,
             PerceiverModel,
             PerceiverPreTrainedModel,

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -347,6 +347,7 @@
 MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
     [
         # Model for Sequence Classification mapping
+        ("perceiver", "PerceiverForSequenceClassification"),
         ("qdqbert", "QDQBertForSequenceClassification"),
         ("fnet", "FNetForSequenceClassification"),
         ("gptj", "GPTJForSequenceClassification"),

diff --git a/src/transformers/models/perceiver/__init__.py b/src/transformers/models/perceiver/__init__.py
@@ -37,6 +37,7 @@
         "PerceiverForMaskedLM",
         "PerceiverForMultimodalAutoencoding",
         "PerceiverForOpticalFlow",
+        "PerceiverForSequenceClassification",
         "PerceiverLayer",
         "PerceiverModel",
         "PerceiverPreTrainedModel",
@@ -59,6 +60,7 @@
             PerceiverForMaskedLM,
             PerceiverForMultimodalAutoencoding,
             PerceiverForOpticalFlow,
+            PerceiverForSequenceClassification,
             PerceiverLayer,
             PerceiverModel,
             PerceiverPreTrainedModel,

diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -956,6 +956,115 @@ def forward(
         )
 
 
+@add_start_docstrings("""Example use of Perceiver for text classification. """, PERCEIVER_START_DOCSTRING)
+class PerceiverForSequenceClassification(PerceiverPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        trainable_position_encoding_kwargs_decoder = dict(num_channels=config.d_latents, index_dims=1)
+
+        self.num_labels = config.num_labels
+        self.perceiver = PerceiverModel(
+            config,
+            input_preprocessor=PerceiverTextPreprocessor(config),
+            decoder=PerceiverClassificationDecoder(
+                config,
+                num_channels=config.d_latents,
+                trainable_position_encoding_kwargs=trainable_position_encoding_kwargs_decoder,
+                use_query_residual=True,
+            ),
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=PerceiverClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        inputs=None,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        labels=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the classification/regression loss. Indices should be in :obj:`[0, ...,
+            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+
+        Examples::
+
+            >>> from transformers import PerceiverTokenizer, PerceiverForSequenceClassification
+
+            >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
+            >>> model = PerceiverForSequenceClassification.from_pretrained('deepmind/language-perceiver')
+
+            >>> text = "hello world"
+            >>> inputs = tokenizer(text, return_tensors="pt").input_ids
+            >>> outputs = model(inputs=inputs)
+            >>> logits = outputs.logits
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.perceiver(
+            inputs=inputs,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        logits = outputs.logits if return_dict else outputs[0]
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:  # infer from num_labels / label dtype and store on the config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())  # drop singleton dims on both sides
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # outputs[1] is left out (assumed: last hidden state — confirm)
+            return ((loss,) + output) if loss is not None else output
+
+        return PerceiverClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
 @add_start_docstrings(
     """
 Example use of Perceiver for image classification, for tasks such as ImageNet.

diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
@@ -3769,6 +3769,18 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class PerceiverForSequenceClassification:
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    def forward(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class PerceiverLayer:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])

diff --git a/tests/test_modeling_perceiver.py b/tests/test_modeling_perceiver.py
@@ -25,6 +25,7 @@
 from transformers import PerceiverConfig
 from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.models.auto import get_values
+from transformers.models.perceiver.modeling_perceiver import PerceiverForSequenceClassification
 from transformers.testing_utils import require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -137,7 +138,7 @@ def prepare_config_and_inputs(self, model_class=None):
         if model_class is None or model_class.__name__ == "PerceiverModel":
             inputs = floats_tensor([self.batch_size, self.seq_length, config.d_model], self.vocab_size)
             return config, inputs, input_mask, sequence_labels, token_labels
-        elif model_class.__name__ == "PerceiverForMaskedLM":
+        elif model_class.__name__ in ["PerceiverForMaskedLM", "PerceiverForSequenceClassification"]:
             inputs = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
             # input mask is only relevant for text inputs
             if self.use_input_mask:
@@ -171,33 +172,33 @@ def prepare_config_and_inputs(self, model_class=None):
 
         return config, inputs, input_mask, sequence_labels, token_labels
 
-    def prepare_config_and_inputs_masked_lm(self):
-        inputs = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+    # def prepare_config_and_inputs_masked_lm(self):
+    #     inputs = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
-        input_mask = None
-        if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+    #     input_mask = None
+    #     if self.use_input_mask:
+    #         input_mask = random_attention_mask([self.batch_size, self.seq_length])
 
-        token_labels = None
-        if self.use_labels:
-            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+    #     token_labels = None
+    #     if self.use_labels:
+    #         token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
 
-        config = self.get_config()
+    #     config = self.get_config()
 
-        return config, inputs, input_mask, token_labels
+    #     return config, inputs, input_mask, token_labels
 
-    def prepare_config_and_inputs_image_classification(self):
-        inputs = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+    # def prepare_config_and_inputs_classification(self):
+    #     inputs = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
 
-        input_mask = None
+    #     input_mask = None
 
-        image_labels = None
-        if self.use_labels:
-            image_labels = ids_tensor([self.batch_size], self.num_labels)
+    #     classification_labels = None
+    #     if self.use_labels:
+    #         classification_labels = ids_tensor([self.batch_size], self.num_labels)
 
-        config = self.get_config()
+    #     config = self.get_config()
 
-        return config, inputs, input_mask, image_labels
+    #     return config, inputs, input_mask, classification_labels
 
     def get_config(self):
         return PerceiverConfig(
@@ -220,21 +221,56 @@ def get_config(self):
             num_labels=self.num_labels,
         )
 
-    def create_and_check_for_masked_lm(self, config, inputs, input_mask, token_labels):
+    def create_and_check_for_masked_lm(self, config, inputs, input_mask, sequence_labels, token_labels):
         model = PerceiverForMaskedLM(config=config)
         model.to(torch_device)
         model.eval()
         result = model(inputs, attention_mask=input_mask, labels=token_labels)
         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
 
-    def create_and_check_for_image_classification(self, config, inputs, input_mask, image_labels):
+    def create_and_check_for_sequence_classification(self, config, inputs, input_mask, sequence_labels, token_labels):
+        """Forward `inputs` with `sequence_labels` and check that the logits are (batch_size, num_labels)."""
+        config.num_labels = self.num_labels
+        classifier = PerceiverForSequenceClassification(config=config).to(torch_device).eval()
+        expected_shape = (self.batch_size, self.num_labels)
+
+        outputs = classifier(inputs, attention_mask=input_mask, labels=sequence_labels)
+        self.parent.assertEqual(outputs.logits.shape, expected_shape)
+
+    def create_and_check_for_image_classification_learned(
+        self, config, inputs, input_mask, sequence_labels, token_labels
+    ):
         # set d_model and num_labels
         config.d_model = 512
         config.num_labels = self.num_labels
         model = PerceiverForImageClassificationLearned(config=config)
         model.to(torch_device)
         model.eval()
-        result = model(inputs, attention_mask=input_mask, labels=image_labels)
+        result = model(inputs, attention_mask=input_mask, labels=sequence_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
+
+    def create_and_check_for_image_classification_fourier(
+        self, config, inputs, input_mask, sequence_labels, token_labels
+    ):
+        """Forward `inputs` with `sequence_labels` and check that the logits are (batch_size, num_labels)."""
+        config.d_model = 261  # presumably the Fourier preprocessor's output channels — confirm
+        config.num_labels = self.num_labels
+        classifier = PerceiverForImageClassificationFourier(config=config).to(torch_device).eval()
+        expected_shape = (self.batch_size, self.num_labels)
+
+        outputs = classifier(inputs, attention_mask=input_mask, labels=sequence_labels)
+        self.parent.assertEqual(outputs.logits.shape, expected_shape)
+
+    def create_and_check_for_image_classification_conv(
+        self, config, inputs, input_mask, sequence_labels, token_labels
+    ):
+        # set d_model and num_labels
+        config.d_model = 322
+        config.num_labels = self.num_labels
+        model = PerceiverForImageClassificationConvProcessing(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(inputs, attention_mask=input_mask, labels=sequence_labels)
         self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
 
     def prepare_config_and_inputs_for_common(self):
@@ -263,6 +299,7 @@ class PerceiverModelTest(ModelTesterMixin, unittest.TestCase):
             PerceiverForImageClassificationFourier,
             PerceiverForOpticalFlow,
             PerceiverForMultimodalAutoencoding,
+            PerceiverForSequenceClassification,
         )
         if is_torch_available()
         else ()
@@ -309,12 +346,30 @@ def test_config(self):
         self.config_tester.check_config_can_be_init_without_params()
 
     def test_for_masked_lm(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_masked_lm()
+        config_and_inputs = self.model_tester.prepare_config_and_inputs(model_class=PerceiverForMaskedLM)
         self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
 
-    def test_for_image_classification(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs_image_classification()
-        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
+    def test_for_sequence_classification(self):
+        prepared = self.model_tester.prepare_config_and_inputs(model_class=PerceiverForSequenceClassification)
+        self.model_tester.create_and_check_for_sequence_classification(*prepared)
+
+    def test_for_image_classification_learned(self):
+        """Build tester inputs for PerceiverForImageClassificationLearned and run its shape check."""
+        tester = self.model_tester
+        prepared = tester.prepare_config_and_inputs(model_class=PerceiverForImageClassificationLearned)
+        tester.create_and_check_for_image_classification_learned(*prepared)
+
+    def test_for_image_classification_fourier(self):
+        """Build tester inputs for PerceiverForImageClassificationFourier and run its shape check."""
+        tester = self.model_tester
+        prepared = tester.prepare_config_and_inputs(model_class=PerceiverForImageClassificationFourier)
+        tester.create_and_check_for_image_classification_fourier(*prepared)
+
+    def test_for_image_classification_conv(self):
+        """Build tester inputs for PerceiverForImageClassificationConvProcessing and run its shape check."""
+        tester = self.model_tester
+        prepared = tester.prepare_config_and_inputs(model_class=PerceiverForImageClassificationConvProcessing)
+        tester.create_and_check_for_image_classification_conv(*prepared)
 
     def test_model_common_attributes(self):
         for model_class in self.all_model_classes:
@@ -676,6 +731,7 @@ def test_correct_missing_keys(self):
             if model_class in [
                 PerceiverForOpticalFlow,
                 PerceiverForMultimodalAutoencoding,
+                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
                 *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
             ]:
                 continue