diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 7760369507bd92..83bb7041d3942b 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -800,7 +800,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): _ = kwargs.pop("code_revision", None) if os.path.isdir(pretrained_model_name_or_path): tokenizer_class.register_for_auto_class() - return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + return tokenizer_class.from_pretrained( + pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs + ) elif config_tokenizer_class is not None: tokenizer_class = None if use_fast and not config_tokenizer_class.endswith("Fast"): diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index f4a467c32fa92d..c6b003c612d34e 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1810,6 +1810,7 @@ def from_pretrained( local_files_only: bool = False, token: Optional[Union[str, bool]] = None, revision: str = "main", + trust_remote_code=False, **kwargs, ): r""" @@ -1853,6 +1854,10 @@ def from_pretrained( facebook/rag-token-base), specify it here. inputs (additional positional arguments, *optional*): Will be passed along to the Tokenizer `__init__` method. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. kwargs (additional keyword arguments, *optional*): Will be passed to the Tokenizer `__init__` method. 
Can be used to set special tokens like `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`, @@ -2036,6 +2041,7 @@ def from_pretrained( local_files_only=local_files_only, _commit_hash=commit_hash, _is_local=is_local, + trust_remote_code=trust_remote_code, **kwargs, ) @@ -2051,6 +2057,7 @@ def _from_pretrained( local_files_only=False, _commit_hash=None, _is_local=False, + trust_remote_code=False, **kwargs, ): # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json @@ -2099,6 +2106,10 @@ def _from_pretrained( ) if config_tokenizer_class is None: + # Matt: This entire block is only used to decide if the tokenizer class matches the class in the repo. + # If not, it raises a warning, but otherwise continues. Since we mostly load tokenizers with + # AutoTokenizer these days, it seems like a lot of work (and a source of bugs) for little gain. + # Maybe we can just remove this entirely? from .models.auto.configuration_auto import AutoConfig # tests_ignore # Second attempt. If we have not yet found tokenizer_class, let's try to use the config. @@ -2108,6 +2119,7 @@ def _from_pretrained( token=token, cache_dir=cache_dir, local_files_only=local_files_only, + trust_remote_code=trust_remote_code, _commit_hash=_commit_hash, ) config_tokenizer_class = config.tokenizer_class diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py index 8ebf834f12ae08..2de5d1a9174aec 100644 --- a/tests/models/auto/test_tokenization_auto.py +++ b/tests/models/auto/test_tokenization_auto.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json import os import shutil import sys @@ -429,3 +430,73 @@ def test_cached_tokenizer_has_minimum_calls_to_head(self): self.assertEqual(counter["GET"], 0) self.assertEqual(counter["HEAD"], 1) self.assertEqual(counter.total_calls, 1) + + def test_init_tokenizer_with_trust(self): + nop_tokenizer_code = """ +import transformers + +class NopTokenizer(transformers.PreTrainedTokenizer): + def get_vocab(self): + return {} +""" + + nop_config_code = """ +from transformers import PretrainedConfig + +class NopConfig(PretrainedConfig): + model_type = "test_unregistered_dynamic" + + def __init__(self, **kwargs): + super().__init__(**kwargs) +""" + + with tempfile.TemporaryDirectory() as tmp_dir: + fake_model_id = "hf-internal-testing/test_unregistered_dynamic" + fake_repo = os.path.join(tmp_dir, fake_model_id) + os.makedirs(fake_repo) + + tokenizer_src_file = os.path.join(fake_repo, "tokenizer.py") + with open(tokenizer_src_file, "w") as wfp: + wfp.write(nop_tokenizer_code) + + model_config_src_file = os.path.join(fake_repo, "config.py") + with open(model_config_src_file, "w") as wfp: + wfp.write(nop_config_code) + + config = { + "model_type": "test_unregistered_dynamic", + "auto_map": {"AutoConfig": f"{fake_model_id}--config.NopConfig"}, + } + + config_file = os.path.join(fake_repo, "config.json") + with open(config_file, "w") as wfp: + json.dump(config, wfp, indent=2) + + tokenizer_config = { + "auto_map": { + "AutoTokenizer": [ + f"{fake_model_id}--tokenizer.NopTokenizer", + None, + ] + } + } + + tokenizer_config_file = os.path.join(fake_repo, "tokenizer_config.json") + with open(tokenizer_config_file, "w") as wfp: + json.dump(tokenizer_config, wfp, indent=2) + + prev_dir = os.getcwd() + try: + # it looks like subdir= is broken in the from_pretrained also, so this is necessary + os.chdir(tmp_dir) + + # this should work because we trust the code + _ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=True) + try: + # this should 
fail because we don't trust and we're not at a terminal for interactive response + _ = AutoTokenizer.from_pretrained(fake_model_id, local_files_only=True, trust_remote_code=False) + self.fail("AutoTokenizer.from_pretrained with trust_remote_code=False should raise ValueError") + except ValueError: + pass + finally: + os.chdir(prev_dir)