
Commit 191f88a
WIP getting equivalence on pipelines
1 parent f3903bb

File tree: 4 files changed, +82 -103 lines changed

src/transformers/tokenization_roberta_fast.py

Lines changed: 3 additions & 3 deletions
@@ -190,10 +190,10 @@ def mask_token(self) -> str:

     @mask_token.setter
     def mask_token(self, value):
-        """ Overriding the default behavior of the mask token to have it eat the space before it.
+        """Overriding the default behavior of the mask token to have it eat the space before it.

-            This is needed to preserve backward compatibility with all the previously used models
-            based on Roberta.
+        This is needed to preserve backward compatibility with all the previously used models
+        based on Roberta.
         """
         # Mask token behave like a normal word, i.e. include the space before it
         # So we set lstrip to True
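For readers without the surrounding file: the behavior the docstring describes comes from registering the mask token with lstrip=True. A minimal sketch of the effect (the checkpoint name and the example sentence are illustrative assumptions, not part of this diff):

from transformers import RobertaTokenizerFast

tok = RobertaTokenizerFast.from_pretrained("roberta-base")
# "<mask>" is registered with lstrip=True, so the space preceding it is
# absorbed by the mask token itself rather than tokenized separately.
ids = tok("Paris is the <mask> of France.")["input_ids"]
assert tok.mask_token_id in ids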

tests/test_pipelines_common.py

Lines changed: 66 additions & 100 deletions
@@ -1,22 +1,24 @@
 import unittest
-from unittest import mock
 from typing import List, Optional
+from unittest import mock

 from transformers import is_tf_available, is_torch_available, pipeline
-from transformers.tokenization_utils_base import to_py_obj
 from transformers.pipelines import DefaultArgumentHandler, Pipeline
 from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow
+from transformers.tokenization_utils_base import to_py_obj


 VALID_INPUTS = ["A simple string", ["list of strings"]]


-@is_pipeline_test
+# @is_pipeline_test
 class CustomInputPipelineCommonMixin:
     pipeline_task = None
-    pipeline_loading_kwargs = {}
-    small_models = None  # Models tested without the @slow decorator
-    large_models = None  # Models tested with the @slow decorator
+    pipeline_loading_kwargs = {}  # Additional kwargs to load the pipeline with
+    pipeline_running_kwargs = {}  # Additional kwargs to run the pipeline with
+    small_models = []  # Models tested without the @slow decorator
+    large_models = []  # Models tested with the @slow decorator
+    valid_inputs = VALID_INPUTS  # Some inputs which are valid to compare fast and slow tokenizers

     def setUp(self) -> None:
         if not is_tf_available() and not is_torch_available():
@@ -48,78 +50,41 @@ def setUp(self) -> None:
     @require_torch
     @slow
     def test_pt_defaults(self):
-        pipeline(self.pipeline_task, framework="pt")
+        pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs)

     @require_tf
     @slow
     def test_tf_defaults(self):
-        pipeline(self.pipeline_task, framework="tf")
+        pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs)

     @require_torch
     def test_torch_small(self):
         for model_name in self.small_models:
-            nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt")
+            nlp = pipeline(
+                task=self.pipeline_task,
+                model=model_name,
+                tokenizer=model_name,
+                framework="pt",
+                **self.pipeline_loading_kwargs,
+            )
             self._test_pipeline(nlp)

     @require_tf
     def test_tf_small(self):
         for model_name in self.small_models:
-            nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf")
+            nlp = pipeline(
+                task=self.pipeline_task,
+                model=model_name,
+                tokenizer=model_name,
+                framework="tf",
+                **self.pipeline_loading_kwargs,
+            )
             self._test_pipeline(nlp)

     @require_torch
     @slow
     def test_torch_large(self):
         for model_name in self.large_models:
-            nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="pt")
-            self._test_pipeline(nlp)
-
-    @require_tf
-    @slow
-    def test_tf_large(self):
-        for model_name in self.large_models:
-            nlp = pipeline(task=self.pipeline_task, model=model_name, tokenizer=model_name, framework="tf")
-            self._test_pipeline(nlp)
-
-    def _test_pipeline(self, nlp: Pipeline):
-        raise NotImplementedError
-
-
-# @is_pipeline_test
-class MonoInputPipelineCommonMixin:
-    pipeline_task = None
-    pipeline_loading_kwargs = {}  # Additional kwargs to load the pipeline with
-    pipeline_running_kwargs = {}  # Additional kwargs to run the pipeline with
-    small_models = []  # Models tested without the @slow decorator
-    large_models = []  # Models tested with the @slow decorator
-    mandatory_keys = {}  # Keys which should be in the output
-    valid_inputs = VALID_INPUTS  # inputs which are valid
-    invalid_inputs = [None]  # inputs which are not allowed
-    expected_multi_result: Optional[List] = None
-    expected_check_keys: Optional[List[str]] = None
-
-    def setUp(self) -> None:
-        if not is_tf_available() and not is_torch_available():
-            return  # Currently no JAX pipelines
-
-        for model_name in self.small_models:
-            pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs)
-        for model_name in self.large_models:
-            pipeline(self.pipeline_task, model=model_name, tokenizer=model_name, **self.pipeline_loading_kwargs)
-
-    @require_torch
-    @slow
-    def test_pt_defaults_loads(self):
-        pipeline(self.pipeline_task, framework="pt", **self.pipeline_loading_kwargs)
-
-    @require_tf
-    @slow
-    def test_tf_defaults_loads(self):
-        pipeline(self.pipeline_task, framework="tf", **self.pipeline_loading_kwargs)
-
-    @require_torch
-    def test_torch_small(self):
-        for model_name in self.small_models:
             nlp = pipeline(
                 task=self.pipeline_task,
                 model=model_name,
@@ -130,8 +95,9 @@ def test_torch_small(self):
             self._test_pipeline(nlp)

     @require_tf
-    def test_tf_small(self):
-        for model_name in self.small_models:
+    @slow
+    def test_tf_large(self):
+        for model_name in self.large_models:
             nlp = pipeline(
                 task=self.pipeline_task,
                 model=model_name,
@@ -141,6 +107,9 @@ def test_tf_small(self):
             )
             self._test_pipeline(nlp)

+    def _test_pipeline(self, nlp: Pipeline):
+        raise NotImplementedError
+
     @require_torch
     def test_compare_slow_fast_torch(self):
         for model_name in self.small_models:
@@ -160,7 +129,7 @@ def test_compare_slow_fast_torch(self):
                 use_fast=True,
                 **self.pipeline_loading_kwargs,
             )
-            self._compare_slow_fast_pipelines(nlp_slow, nlp_fast)
+            self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="forward")

     @require_tf
     def test_compare_slow_fast_tf(self):
@@ -181,54 +150,51 @@ def test_compare_slow_fast_tf(self):
                 use_fast=True,
                 **self.pipeline_loading_kwargs,
             )
-            self._compare_slow_fast_pipelines(nlp_slow, nlp_fast)
-
-    def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline):
-        with mock.patch.object(nlp_slow.model, 'forward', wraps=nlp_slow.model.forward) as mock_slow,\
-            mock.patch.object(nlp_fast.model, 'forward', wraps=nlp_fast.model.forward) as mock_fast:
+            self._compare_slow_fast_pipelines(nlp_slow, nlp_fast, method="call")
+
+    def _compare_slow_fast_pipelines(self, nlp_slow: Pipeline, nlp_fast: Pipeline, method: str):
+        """We check that the inputs to the models forward passes are identical for
+        slow and fast tokenizers.
+        """
+        with mock.patch.object(
+            nlp_slow.model, method, wraps=getattr(nlp_slow.model, method)
+        ) as mock_slow, mock.patch.object(nlp_fast.model, method, wraps=getattr(nlp_fast.model, method)) as mock_fast:
             for inputs in self.valid_inputs:
-                outputs_slow = nlp_slow(inputs, **self.pipeline_running_kwargs)
-                outputs_fast = nlp_fast(inputs, **self.pipeline_running_kwargs)
+                if isinstance(inputs, dict):
+                    inputs.update(self.pipeline_running_kwargs)
+                    _ = nlp_slow(**inputs)
+                    _ = nlp_fast(**inputs)
+                else:
+                    _ = nlp_slow(inputs, **self.pipeline_running_kwargs)
+                    _ = nlp_fast(inputs, **self.pipeline_running_kwargs)

             mock_slow.assert_called()
             mock_fast.assert_called()

-            slow_call_args, slow_call_kwargs = mock_slow.call_args
-            fast_call_args, fast_call_kwargs = mock_fast.call_args
+            self.assertEqual(len(mock_slow.call_args_list), len(mock_fast.call_args_list))
+            for mock_slow_call_args, mock_fast_call_args in zip(
+                mock_slow.call_args_list, mock_fast.call_args_list
+            ):
+                slow_call_args, slow_call_kwargs = mock_slow_call_args
+                fast_call_args, fast_call_kwargs = mock_fast_call_args

-            slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs)
-            fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs)
+                slow_call_args, slow_call_kwargs = to_py_obj(slow_call_args), to_py_obj(slow_call_kwargs)
+                fast_call_args, fast_call_kwargs = to_py_obj(fast_call_args), to_py_obj(fast_call_kwargs)

-            self.assertEqual(slow_call_args, fast_call_args)
-            self.assertDictEqual(slow_call_kwargs, fast_call_kwargs)
+                self.assertEqual(slow_call_args, fast_call_args)
+                self.assertDictEqual(slow_call_kwargs, fast_call_kwargs)

-            self.assertEqual(outputs_slow, outputs_fast)

-    @require_torch
-    @slow
-    def test_torch_large(self):
-        for model_name in self.large_models:
-            nlp = pipeline(
-                task=self.pipeline_task,
-                model=model_name,
-                tokenizer=model_name,
-                framework="pt",
-                **self.pipeline_loading_kwargs,
-            )
-            self._test_pipeline(nlp)
+@is_pipeline_test
+class MonoInputPipelineCommonMixin(CustomInputPipelineCommonMixin):
+    """A version of the CustomInputPipelineCommonMixin
+    with a predefined `_test_pipeline` method.
+    """

-    @require_tf
-    @slow
-    def test_tf_large(self):
-        for model_name in self.large_models:
-            nlp = pipeline(
-                task=self.pipeline_task,
-                model=model_name,
-                tokenizer=model_name,
-                framework="tf",
-                **self.pipeline_loading_kwargs,
-            )
-            self._test_pipeline(nlp)
+    mandatory_keys = {}  # Keys which should be in the output
+    invalid_inputs = [None]  # inputs which are not allowed
+    expected_multi_result: Optional[List] = None
+    expected_check_keys: Optional[List[str]] = None

     def _test_pipeline(self, nlp: Pipeline):
         self.assertIsNotNone(nlp)
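The equivalence check above hinges on mock.patch.object(..., wraps=...): the patched method still executes normally while the mock records every call for later comparison. A self-contained sketch of that pattern with a toy class (no transformers dependency; all names here are illustrative):

import unittest
from unittest import mock


class ToyModel:
    def forward(self, x, scale=1):
        # Stands in for a model's forward pass.
        return x * scale


class WrapsPatternTest(unittest.TestCase):
    def test_call_args_match(self):
        slow, fast = ToyModel(), ToyModel()
        with mock.patch.object(slow, "forward", wraps=slow.forward) as mock_slow, mock.patch.object(
            fast, "forward", wraps=fast.forward
        ) as mock_fast:
            # The wrapped methods still return their real results.
            self.assertEqual(slow.forward(3, scale=2), 6)
            self.assertEqual(fast.forward(3, scale=2), 6)

        # Every call was recorded, so the two call sequences can be compared
        # pairwise, exactly as _compare_slow_fast_pipelines does above.
        self.assertEqual(len(mock_slow.call_args_list), len(mock_fast.call_args_list))
        for slow_call, fast_call in zip(mock_slow.call_args_list, mock_fast.call_args_list):
            self.assertEqual(slow_call, fast_call)


if __name__ == "__main__":
    unittest.main()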

tests/test_pipelines_dialog.py

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ class DialoguePipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
     pipeline_task = "conversational"
     small_models = []  # Default model - Models tested without the @slow decorator
     large_models = ["microsoft/DialoGPT-medium"]  # Models tested with the @slow decorator
+    valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]]

     def _test_pipeline(self, nlp: Pipeline):
         valid_inputs = [Conversation("Hi there!"), [Conversation("Hi there!"), Conversation("How are you?")]]
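A hedged usage sketch of the valid_inputs listed above; the conversational pipeline appends the model's reply to the Conversation object it is given (the DialoGPT download and the exact generated text depend on the environment):

from transformers import Conversation, pipeline

nlp = pipeline("conversational", model="microsoft/DialoGPT-medium")
conversation = Conversation("Hi there!")
result = nlp(conversation)
# The generated reply is stored on the conversation itself.
print(result.generated_responses[-1])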

tests/test_pipelines_zero_shot.py

Lines changed: 12 additions & 0 deletions
@@ -11,6 +11,18 @@ class ZeroShotClassificationPipelineTests(CustomInputPipelineCommonMixin, unitte
         "sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english"
     ]  # Models tested without the @slow decorator
     large_models = ["roberta-large-mnli"]  # Models tested with the @slow decorator
+    valid_inputs = [
+        {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics"},
+        {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics"]},
+        {"sequences": "Who are you voting for in 2020?", "candidate_labels": "politics, public health"},
+        {"sequences": "Who are you voting for in 2020?", "candidate_labels": ["politics", "public health"]},
+        {"sequences": ["Who are you voting for in 2020?"], "candidate_labels": "politics"},
+        {
+            "sequences": "Who are you voting for in 2020?",
+            "candidate_labels": "politics",
+            "hypothesis_template": "This text is about {}",
+        },
+    ]

     def _test_scores_sum_to_one(self, result):
         sum = 0.0
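A hedged usage sketch of one of the valid_inputs dictionaries above, which is how `_compare_slow_fast_pipelines` invokes the pipeline with dict inputs; the tiny sshleifer checkpoint is chosen for speed and its scores are illustrative only:

from transformers import pipeline

nlp = pipeline(
    "zero-shot-classification",
    model="sshleifer/tiny-distilbert-base-uncased-finetuned-sst-2-english",
)
result = nlp(
    sequences="Who are you voting for in 2020?",
    candidate_labels=["politics", "public health"],
    hypothesis_template="This text is about {}",
)
# `labels` is sorted by descending score; in the default single-label mode
# the scores sum to one, which _test_scores_sum_to_one checks.
print(result["labels"], result["scores"])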
