Auto convert tekken.json #42299
@@ -33,6 +33,7 @@
 from typing import TYPE_CHECKING, Any, Literal, NamedTuple, Optional, Union, overload

 import numpy as np
+from huggingface_hub import list_repo_files
 from packaging import version

 from . import __version__
@@ -2098,7 +2099,21 @@ def from_pretrained(
             template = template.removesuffix(".jinja")
             vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"

+        # Get files from url, cache, or disk depending on the case
+        if not is_local and not local_files_only:
+            try:
+                remote_files = list_repo_files(pretrained_model_name_or_path)
+            except Exception:
+                remote_files = []
+        else:
+            remote_files = os.listdir(pretrained_model_name_or_path)
+
+        if "tokenizer_file" in vocab_files and not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
+            # mistral tokenizer names are different, but we can still convert them if
+            # mistral common is not there
+            other_pattern = re.escape("tekken.json|tokenizer.model.*")
+            if match := re.search(other_pattern, "\n".join(remote_files)):
+                vocab_files["vocab_file"] = match.group()
+
         resolved_vocab_files = {}
         for file_id, file_path in vocab_files.items():
             if file_path is None:
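For orientation, here is a standalone sketch of what this hunk does when a repo lacks a standard `tokenizer.json`: list the repo files and fall back to a Mistral-native vocab file. The repo id below is hypothetical, and the fallback pattern is written as a raw regex here, whereas the diff passes it through `re.escape`.

```python
import re

from huggingface_hub import list_repo_files

# Hypothetical repo id, used only for illustration.
repo_id = "some-org/some-mistral-finetune"

try:
    remote_files = list_repo_files(repo_id)
except Exception:
    remote_files = []

# If the repo has no tokenizer.json, fall back to a Mistral-native vocab file
# (tekken.json or tokenizer.model*). Written as a raw regex in this sketch.
if "tokenizer.json" not in remote_files:
    if match := re.search(r"tekken\.json|tokenizer\.model.*", "\n".join(remote_files)):
        print("would convert from:", match.group())
```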
@@ -2417,6 +2432,75 @@ def _from_pretrained(
                 "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
                 " fine-tuned or trained."
             )
+        try:
+            vocab_size = tokenizer.vocab_size
+        except NotImplementedError:
+            vocab_size = 0
+
+        if (
+            vocab_size > 100000
+            and hasattr(tokenizer, "_tokenizer")
+            and getattr(tokenizer._tokenizer, "pre_tokenizer", None) is not None
+        ):
+            from huggingface_hub import model_info
+
+            def is_base_mistral(model_id: str) -> bool:
+                model = model_info(model_id)
+                if model.tags is not None:
+                    if re.search("base_model:.*mistralai", "".join(model.tags)):
+                        return True
+                return False
+
+            if _is_local or is_base_mistral(pretrained_model_name_or_path):
+                _config_file = cached_file(
+                    pretrained_model_name_or_path,
+                    "config.json",
+                    cache_dir=cache_dir,
+                    token=token,
+                    local_files_only=local_files_only,
+                    _raise_exceptions_for_missing_entries=False,
+                    _raise_exceptions_for_connection_errors=False,
+                    _commit_hash=_commit_hash,
+                )
+                if _config_file is not None:
+                    with open(_config_file, encoding="utf-8") as f:
+                        _config = json.load(f)
+                    transformers_version = _config.get("transformers_version")
+
+                    if transformers_version and version.parse(transformers_version) <= version.parse("4.57.2"):
+                        if _is_local and _config.model_type not in [
+                            "mistral",
+                            "mistral3",
+                            "voxstral",
+                            "ministral",
+                            "pixtral",
+                        ]:
+                            return tokenizer
Comment on lines +2470 to +2478

Contributor:
The non-existent attribute use of `_config.model_type` will fail here: `_config` is a plain dict loaded with `json.load`, so it has no `model_type` attribute. Change it to `_config.get("model_type")`.

Collaborator (Author):
yeah sorry

Collaborator (Author):
I have no idea why the CI was full green
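To illustrate the point raised in this thread: a config parsed with `json.load` is a plain `dict`, so attribute-style access fails while `.get()` works (the values below are made up).

```python
import json

# Made-up config payload, for illustration only.
_config = json.loads('{"model_type": "mistral", "transformers_version": "4.57.1"}')

print(_config.get("model_type"))  # -> mistral

try:
    _config.model_type  # attribute access on a dict
except AttributeError as err:
    print(err)  # 'dict' object has no attribute 'model_type'
```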
+
+        # Expose the `fix_mistral_regex` flag on the tokenizer when provided, even if no correction is applied.
+        if "fix_mistral_regex" in init_kwargs:
+            setattr(tokenizer, "fix_mistral_regex", init_kwargs["fix_mistral_regex"])
+
+        fix_mistral_regex = kwargs.get("fix_mistral_regex")  # not init kwargs
+        # only warn if its not explicitly passed
+        if fix_mistral_regex is None and not getattr(tokenizer, "fix_mistral_regex", False):
+            setattr(tokenizer, "fix_mistral_regex", False)
+            logger.warning(
+                f"The tokenizer you are loading from '{pretrained_model_name_or_path}'"
+                f" with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. "
+                " This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue."
+            )
+        elif fix_mistral_regex is True or getattr(tokenizer, "fix_mistral_regex", False):
+            setattr(tokenizer, "fix_mistral_regex", True)
+            import tokenizers
+
+            tokenizer.backend_tokenizer.pre_tokenizer[0] = tokenizers.pre_tokenizers.Split(
+                pattern=tokenizers.Regex(
+                    r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+"
+                ),
+                behavior="isolated",
+            )
+
         return tokenizer

     @staticmethod
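Going by the warning text in the hunk above, opting into the corrected pre-tokenizer would presumably look like the sketch below; the repo id is the one from the discussion link in the warning, and the flag is simply forwarded as a loading kwarg.

```python
from transformers import AutoTokenizer

# Repo id taken from the discussion link in the warning; any affected
# Mistral-derived fast tokenizer would presumably be loaded the same way.
tok = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    fix_mistral_regex=True,  # opt in to the corrected Split pattern
)
```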
Reviewer:
Hmm, so that's only for the mistral org, no? Should we directly check `model_type in ["mistral", ...]` so that it also works for other orgs?

Author:
We can't do that until we download the config / the config is there.
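For reference, the org-based check being discussed is the `is_base_mistral` helper from the hunk above, which inspects Hub metadata tags rather than the config (since the config may not have been downloaded yet). A standalone sketch, with a hypothetical repo id in the usage comment:

```python
import re

from huggingface_hub import model_info


def is_base_mistral(model_id: str) -> bool:
    """Return True if the Hub tags mark the model as derived from a mistralai base model."""
    tags = model_info(model_id).tags or []
    return bool(re.search("base_model:.*mistralai", "".join(tags)))


# Hypothetical fine-tune repo id, for illustration only:
# is_base_mistral("some-org/mistral-small-finetune")
```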