Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
ydshieh committed Dec 14, 2023
1 parent 2788f8d commit fabf6be
Show file tree
Hide file tree
Showing 2 changed files with 250 additions and 3 deletions.
252 changes: 249 additions & 3 deletions src/transformers/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,28 @@
Processing saving/loading class for common processors.
"""

import copy
import inspect
import json
import os
import warnings
from pathlib import Path
from typing import Optional, Union
from typing import Any, Dict, Optional, Tuple, Union

from .dynamic_module_utils import custom_object_save
from .tokenization_utils_base import PreTrainedTokenizerBase
from .utils import PushToHubMixin, copy_func, direct_transformers_import, logging
from .utils import (
PROCESSOR_NAME,
PushToHubMixin,
add_model_info_to_auto_map,
cached_file,
copy_func,
direct_transformers_import,
download_url,
is_offline_mode,
is_remote_url,
logging,
)


logger = logging.get_logger(__name__)
Expand Down Expand Up @@ -85,6 +99,77 @@ def __init__(self, *args, **kwargs):

setattr(self, attribute_name, arg)

def to_dict(self) -> Dict[str, Any]:
    """
    Serializes this instance to a Python dictionary.

    Returns:
        `Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
    """
    state = copy.deepcopy(self.__dict__)

    # Keep only entries that correspond to keyword arguments of `__init__`, excluding the
    # sub-processor attributes themselves (those are saved through their own serialization).
    init_params = inspect.signature(self.__init__).parameters
    attribute_names = self.__class__.attributes
    state = {key: value for key, value in state.items() if key in init_params and key not in attribute_names}

    state["processor_type"] = self.__class__.__name__

    # TODO: deal the following with a generic approach - by checking the types of the values.
    # Sub-objects must never be embedded in the serialized dict. The extra names below cover
    # legacy/special processor classes:
    #   - "current_processor": old classes such as `Wav2Vec2Processor`
    #   - "char_tokenizer"/"bpe_tokenizer"/"wp_tokenizer": `MgpstrProcessor`
    #   - "qformer_tokenizer": `InstructBlipProcessor` (TODO: update custom `from_pretrained`)
    #   - "decoder": `Wav2Vec2ProcessorWithLM` (TODO: update custom `from_pretrained`)
    for non_serializable in (
        "tokenizer",
        "image_processor",
        "feature_extractor",
        "current_processor",
        "char_tokenizer",
        "bpe_tokenizer",
        "wp_tokenizer",
        "qformer_tokenizer",
        "decoder",
    ):
        state.pop(non_serializable, None)

    return state

def to_json_string(self) -> str:
    """
    Serializes this instance to a JSON string.

    Returns:
        `str`: String containing all the attributes that make up this processor instance in JSON format.
    """
    # Keys are sorted so the output is deterministic; the trailing newline keeps the
    # written file POSIX-friendly.
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

def to_json_file(self, json_file_path: Union[str, os.PathLike]):
    """
    Save this instance to a JSON file.

    Args:
        json_file_path (`str` or `os.PathLike`):
            Path to the JSON file in which this processor instance's parameters will be saved.
    """
    # Serialize first so a serialization error cannot leave a truncated file behind.
    json_string = self.to_json_string()
    with open(json_file_path, "w", encoding="utf-8") as writer:
        writer.write(json_string)

def __repr__(self):
attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
attributes_repr = "\n".join(attributes_repr)
Expand Down Expand Up @@ -156,6 +241,12 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
if isinstance(attribute, PreTrainedTokenizerBase):
del attribute.init_kwargs["auto_map"]

# If we save using the predefined names, we can load using `from_pretrained`
output_processor_file = os.path.join(save_directory, PROCESSOR_NAME)

self.to_json_file(output_processor_file)
logger.info(f"processor saved in {output_processor_file}")

if push_to_hub:
self._upload_modified_files(
save_directory,
Expand All @@ -165,6 +256,149 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
token=kwargs.get("token"),
)

@classmethod
def get_processor_dict(
    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
    processor of type [`~processing_utils.ProcessingMixin`] using `from_args_and_dict`.

    Parameters:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    Returns:
        `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object.
    """
    # Hub/cache related options are consumed here; whatever remains in `kwargs` is
    # returned unchanged to the caller.
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    token = kwargs.pop("token", None)
    local_files_only = kwargs.pop("local_files_only", False)
    revision = kwargs.pop("revision", None)
    subfolder = kwargs.pop("subfolder", "")

    # Internal flags, only used to build the telemetry user agent below.
    from_pipeline = kwargs.pop("_from_pipeline", None)
    from_auto_class = kwargs.pop("_from_auto", False)

    user_agent = {"file_type": "processor", "from_auto_class": from_auto_class}
    if from_pipeline is not None:
        user_agent["using_pipeline"] = from_pipeline

    if is_offline_mode() and not local_files_only:
        logger.info("Offline mode: forcing local_files_only=True")
        local_files_only = True

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    is_local = os.path.isdir(pretrained_model_name_or_path)
    if os.path.isdir(pretrained_model_name_or_path):
        processor_file = os.path.join(pretrained_model_name_or_path, PROCESSOR_NAME)
    if os.path.isfile(pretrained_model_name_or_path):
        # A direct path to the config file itself.
        resolved_processor_file = pretrained_model_name_or_path
        is_local = True
    elif is_remote_url(pretrained_model_name_or_path):
        processor_file = pretrained_model_name_or_path
        resolved_processor_file = download_url(pretrained_model_name_or_path)
    else:
        # Otherwise treat the identifier as a repo id on the Hub (local directories are
        # also handled by `cached_file` — NOTE(review): this overwrites the joined path
        # computed above; presumably intentional since `cached_file` resolves it itself).
        processor_file = PROCESSOR_NAME
        try:
            # Load from local folder or from cache or download from model Hub and cache
            resolved_processor_file = cached_file(
                pretrained_model_name_or_path,
                processor_file,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                local_files_only=local_files_only,
                token=token,
                user_agent=user_agent,
                revision=revision,
                subfolder=subfolder,
            )
        except EnvironmentError:
            # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
            # the original exception.
            raise
        except Exception:
            # For any other exception, we throw a generic error.
            raise EnvironmentError(
                f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load"
                " it from 'https://huggingface.co/models', make sure you don't have a local directory with the"
                f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                f" directory containing a {PROCESSOR_NAME} file"
            )

    try:
        # Load processor dict
        with open(resolved_processor_file, "r", encoding="utf-8") as reader:
            text = reader.read()
        processor_dict = json.loads(text)

    except json.JSONDecodeError:
        raise EnvironmentError(
            f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file."
        )

    if is_local:
        logger.info(f"loading configuration file {resolved_processor_file}")
    else:
        logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}")

    if "auto_map" in processor_dict and not is_local:
        # Qualify custom-code class references with the repo id so they can be resolved later.
        processor_dict["auto_map"] = add_model_info_to_auto_map(
            processor_dict["auto_map"], pretrained_model_name_or_path
        )

    return processor_dict, kwargs

@classmethod
def from_args_and_dict(cls, args, processor_dict: Dict[str, Any], **kwargs):
    """
    Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters.

    Args:
        processor_dict (`Dict[str, Any]`):
            Dictionary that will be used to instantiate the processor object. Such a dictionary can be
            retrieved from a pretrained checkpoint by leveraging the
            [`~processing_utils.ProcessingMixin.to_dict`] method.
        kwargs (`Dict[str, Any]`):
            Additional parameters from which to initialize the processor object.

    Returns:
        [`~processing_utils.ProcessingMixin`]: The processor object instantiated from those
        parameters.
    """
    init_kwargs = dict(processor_dict)
    return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

    # Unlike image processors or feature extractors whose `__init__` accept `kwargs`, processors
    # don't, so the marker added by `to_dict` must be stripped before instantiation.
    init_kwargs.pop("processor_type", None)

    processor = cls(*args, **init_kwargs)

    # Overwrite existing processor attributes from the leftover kwargs; any kwarg consumed
    # this way no longer counts as "unused".
    consumed = [name for name in kwargs if hasattr(processor, name)]
    for name in consumed:
        setattr(processor, name, kwargs.pop(name))

    logger.info(f"Processor {processor}")
    return (processor, kwargs) if return_unused_kwargs else processor

@classmethod
def from_pretrained(
cls,
Expand Down Expand Up @@ -226,7 +460,19 @@ def from_pretrained(
kwargs["token"] = token

args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(*args)

# Existing processors on the Hub don't have `processor_config.json`, but we need to keep `from_pretrained` work.
# This is not ideal (for models added in the future) as it might hide some bug/error silently.
# TODO: How to deal with this better and safer. Can we use timestamp as a condition to determine this?
try:
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
except EnvironmentError as e:
if "does not appear to have a file named processor_config.json." in str(e):
processor_dict, kwargs = {}, {}
else:
raise

return cls.from_args_and_dict(args, processor_dict, **kwargs)

@classmethod
def register_for_auto_class(cls, auto_class="AutoProcessor"):
Expand Down
1 change: 1 addition & 0 deletions src/transformers/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@
# Canonical file names used when saving/loading the corresponding objects in a checkpoint directory.
CONFIG_NAME = "config.json"
FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
# NOTE(review): image processors reuse the feature-extractor file name — presumably for
# backward compatibility with checkpoints saved before image processors existed; confirm.
IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME
PROCESSOR_NAME = "processor_config.json"
GENERATION_CONFIG_NAME = "generation_config.json"
MODEL_CARD_NAME = "modelcard.json"

Expand Down

0 comments on commit fabf6be

Please sign in to comment.