PoC for a ProcessorMixin class

huggingface · sgugger · Feb 9, 2022 · Feb 7, 2022 · Feb 7, 2022 · Feb 8, 2022
commit af76d4d6c5b8c865008d078af81a7329881d6ff0
diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py
@@ -15,12 +15,11 @@
 """
 Image/Text processor class for CLIP
 """
+from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding
-from .feature_extraction_clip import CLIPFeatureExtractor
-from .tokenization_clip import CLIPTokenizer
 
 
-class CLIPProcessor:
+class CLIPProcessor(ProcessorMixin):
     r"""
     Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
 
@@ -33,77 +32,13 @@ class CLIPProcessor:
         tokenizer ([`CLIPTokenizer`]):
             The tokenizer is a required input.
     """
+    feature_extractor_class = "CLIPFeatureExtractor"
+    tokenizer_class = "CLIPTokenizer"
 
     def __init__(self, feature_extractor, tokenizer):
-        if not isinstance(feature_extractor, CLIPFeatureExtractor):
-            raise ValueError(
-                f"`feature_extractor` has to be of type CLIPFeatureExtractor, but is {type(feature_extractor)}"
-            )
-        if not isinstance(tokenizer, CLIPTokenizer):
-            raise ValueError(f"`tokenizer` has to be of type CLIPTokenizer, but is {type(tokenizer)}")
-
-        self.feature_extractor = feature_extractor
-        self.tokenizer = tokenizer
+        super().__init__(feature_extractor, tokenizer)
         self.current_processor = self.feature_extractor
 
-    def save_pretrained(self, save_directory):
-        """
-        Save a CLIP feature extractor object and CLIP tokenizer object to the directory `save_directory`, so that it
-        can be re-loaded using the [`~CLIPProcessor.from_pretrained`] class method.
-
-        <Tip>
-
-        This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
-        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
-        above for more information.
-
-        </Tip>
-
-        Args:
-            save_directory (`str` or `os.PathLike`):
-                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
-                be created if it does not exist).
-        """
-        self.feature_extractor._set_processor_class(self.__class__.__name__)
-        self.feature_extractor.save_pretrained(save_directory)
-
-        self.tokenizer._set_processor_class(self.__class__.__name__)
-        self.tokenizer.save_pretrained(save_directory)
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r"""
-        Instantiate a [`CLIPProcessor`] from a pretrained CLIP processor.
-
-        <Tip>
-
-        This class method is simply calling CLIPFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
-        CLIPTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
-        docstrings of the methods above for more information.
-
-        </Tip>
-
-        Args:
-            pretrained_model_name_or_path (`str` or `os.PathLike`):
-                This can be either:
-
-                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `clip-vit-base-patch32`, or
-                  namespaced under a user or organization name, like `openai/clip-vit-base-patch32`.
-                - a path to a *directory* containing a feature extractor file saved using the
-                  [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
-                - a path or url to a saved feature extractor JSON *file*, e.g.,
-                  `./my_model_directory/preprocessor_config.json`.
-
-            **kwargs
-                Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
-                [`PreTrainedTokenizer`]
-        """
-        feature_extractor = CLIPFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
-
     def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`

diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py
@@ -18,14 +18,12 @@
 import warnings
 from contextlib import contextmanager
 
-from ...tokenization_utils import PreTrainedTokenizer
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ..auto.tokenization_auto import AutoTokenizer
+from ...processing_utils import ProcessorMixin
 from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
 from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
 
 
-class Wav2Vec2Processor:
+class Wav2Vec2Processor(ProcessorMixin):
     r"""
     Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
     processor.
@@ -39,82 +37,17 @@ class Wav2Vec2Processor:
         tokenizer ([`PreTrainedTokenizer`]):
             An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
     """
+    feature_extractor_class = "Wav2Vec2FeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
 
     def __init__(self, feature_extractor, tokenizer):
-        if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
-            raise ValueError(
-                f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
-            )
-        if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
-            raise ValueError(
-                f"`tokenizer` has to be of type {PreTrainedTokenizer.__class__}, but is {type(tokenizer)}"
-            )
-
-        self.feature_extractor = feature_extractor
-        self.tokenizer = tokenizer
+        super().__init__(feature_extractor, tokenizer)
         self.current_processor = self.feature_extractor
 
-    def save_pretrained(self, save_directory):
-        """
-        Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory `save_directory`, so
-        that it can be re-loaded using the [`~Wav2Vec2Processor.from_pretrained`] class method.
-
-        <Tip>
-
-        This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
-        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
-        above for more information.
-
-        </Tip>
-
-        Args:
-            save_directory (`str` or `os.PathLike`):
-                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
-                be created if it does not exist).
-        """
-        self.feature_extractor._set_processor_class(self.__class__.__name__)
-        self.feature_extractor.save_pretrained(save_directory)
-
-        self.tokenizer._set_processor_class(self.__class__.__name__)
-        self.tokenizer.save_pretrained(save_directory)
-
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r"""
-        Instantiate a [`Wav2Vec2Processor`] from a pretrained Wav2Vec2 processor.
-
-        <Tip>
-
-        This class method is simply calling Wav2Vec2FeatureExtractor's
-        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and PreTrainedTokenizer's
-        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
-        above for more information.
-
-        </Tip>
-
-        Args:
-            pretrained_model_name_or_path (`str` or `os.PathLike`):
-                This can be either:
-
-                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
-                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
-                - a path to a *directory* containing a feature extractor file saved using the
-                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
-                - a path or url to a saved feature extractor JSON *file*, e.g.,
-                  `./my_model_directory/preprocessor_config.json`.
-            **kwargs
-                Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
-                [`PreTrainedTokenizer`]
-        """
-        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
-
-        # load generic `AutoTokenizer`
-        # need fallback here for backward compatibility in case processor is
-        # loaded from just a tokenizer file that does not have a `tokenizer_class` attribute
-        # behavior should be deprecated in major future release
         try:
-            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
         except OSError:
             warnings.warn(
                 f"Loading a tokenizer inside {cls.__name__} from a config that does not"
@@ -124,9 +57,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
                 "file to suppress this warning: ",
                 FutureWarning,
             )
+
+            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
             tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
 
-        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
 
     def __call__(self, *args, **kwargs):
         """

diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
@@ -0,0 +1,146 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ Processing saving/loading class for common processors.
+"""
+
+import importlib.util
+from pathlib import Path
+
+
+# Comment to write
+spec = importlib.util.spec_from_file_location(
+    "transformers", Path(__file__).parent / "__init__.py", submodule_search_locations=[Path(__file__).parent]
+)
+transformers_module = spec.loader.load_module()
+
+
+AUTO_TO_BASE_CLASS_MAPPING = {
+    "AutoTokenizer": "PreTrainedTokenizerBase",
+    "AutoFeatureExtractor": "FeatureExtractionMixin",
+}
+
+
+class ProcessorMixin:
+    """
+    This is a mixin used to provide saving/loading functionality for all processor classes.
+    """
+
+    attributes = ["feature_extractor", "tokenizer"]
+    # Names need to be attr_class for attr in attributes
+    feature_extractor_class = None
+    tokenizer_class = None
+
+    # args have to match the attributes class attribute
+    def __init__(self, *args):
+        if len(args) != len(self.attributes):
+            raise ValueError(
+                f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got "
+                f"{len(args)} arguments instead."
+            )
+
+        # Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
+        for arg, attribute_name in zip(args, self.attributes):
+            class_name = getattr(self, f"{attribute_name}_class")
+            # Nothing is every going to be an instance of "AutoXxx", in that case we check the base class.
+            class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name)
+            if isinstance(class_name, tuple):
+                proper_class = tuple(getattr(transformers_module, n) for n in class_name if n is not None)
+            else:
+                proper_class = getattr(transformers_module, class_name)
+
+            if not isinstance(arg, proper_class):
+                raise ValueError(
+                    f"Received a {type(arg)} for argument {attribute_name}, but a {class_name} was expected."
+                )
+
+            setattr(self, attribute_name, arg)
+
+    def __repr__(self):
+        attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
+        attributes_repr = "\n".join(attributes_repr)
+        return f"{self.__class__.__name__}:\n{attributes_repr}"
+
+    def save_pretrained(self, save_directory):
+        """
+        Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
+        can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
+
+        <Tip>
+
+        This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
+        above for more information.
+
+        </Tip>
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
+                be created if it does not exist).
+        """
+        for attribute_name in self.attributes:
+            attribute = getattr(self, attribute_name)
+            # Include the processor class in the attribute config so this processor can then be reloaded with the
+            # `AutoProcessor` API.
+            if hasattr(attribute, "_set_processor_class"):
+                attribute._set_processor_class(self.__class__.__name__)
+            attribute.save_pretrained(save_directory)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r"""
+        Instantiate a processor associated with a pretrained model.
+
+        <Tip>
+
+        This class method is simply calling the feature extractor
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
+        methods above for more information.
+
+        </Tip>
+
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                This can be either:
+
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
+            **kwargs
+                Additional keyword arguments passed along to both
+                [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
+                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
+        """
+        args = []
+        for attribute_name in cls.attributes:
+            class_name = getattr(cls, f"{attribute_name}_class")
+            if isinstance(class_name, tuple):
+                classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
+                use_fast = kwargs.get("use_fast", True)
+                if use_fast and classes[1] is not None:
+                    attribute_class = classes[1]
+                else:
+                    attribute_class = classes[0]
+            else:
+                attribute_class = getattr(transformers_module, class_name)
+
+            args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
+        return cls(*args)