124 changes: 85 additions & 39 deletions src/transformers/models/auto/tokenization_auto.py
@@ -66,8 +66,8 @@
("aimv2", "CLIPTokenizerFast" if is_tokenizers_available() else None),
("albert", "AlbertTokenizer" if is_tokenizers_available() else None),
("align", "BertTokenizer" if is_tokenizers_available() else None),
("arcee", "LlamaTokenizer" if is_tokenizers_available() else None),
("aria", "LlamaTokenizer" if is_tokenizers_available() else None),
("arcee", "TokenizersBackend" if is_tokenizers_available() else None),
("aria", "TokenizersBackend" if is_tokenizers_available() else None),
("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None),
("bark", "BertTokenizer" if is_tokenizers_available() else None),
("bart", "RobertaTokenizer" if is_tokenizers_available() else None),
@@ -92,7 +92,7 @@
("byt5", "ByT5Tokenizer"),
("camembert", "CamembertTokenizer" if is_tokenizers_available() else None),
("canine", "CanineTokenizer"),
("chameleon", "LlamaTokenizer" if is_tokenizers_available() else None),
("chameleon", "TokenizersBackend" if is_tokenizers_available() else None),
("chinese_clip", "BertTokenizer" if is_tokenizers_available() else None),
("clap", "RobertaTokenizer"),
("clip", "CLIPTokenizer" if is_tokenizers_available() else None),
@@ -102,7 +102,7 @@
("codegen", "GPT2Tokenizer" if is_tokenizers_available() else None),
("cohere", "CohereTokenizer" if is_tokenizers_available() else None),
("cohere2", "CohereTokenizer" if is_tokenizers_available() else None),
("colpali", "LlamaTokenizer" if is_tokenizers_available() else None),
("colpali", "TokenizersBackend" if is_tokenizers_available() else None),
("colqwen2", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
("convbert", "BertTokenizer" if is_tokenizers_available() else None),
("cpm", "CpmTokenizer" if is_tokenizers_available() else None),
@@ -114,19 +114,19 @@
("dbrx", "GPT2Tokenizer" if is_tokenizers_available() else None),
("deberta", "DebertaTokenizer" if is_tokenizers_available() else None),
("deberta-v2", "DebertaV2Tokenizer" if is_tokenizers_available() else None),
("deepseek_v2", "LlamaTokenizer" if is_tokenizers_available() else None),
("deepseek_v3", "LlamaTokenizer" if is_tokenizers_available() else None),
("deepseek_vl", "LlamaTokenizer" if is_tokenizers_available() else None),
("deepseek_vl_hybrid", "LlamaTokenizer" if is_tokenizers_available() else None),
("deepseek_v2", "TokenizersBackend" if is_tokenizers_available() else None),
("deepseek_v3", "TokenizersBackend" if is_tokenizers_available() else None),
("deepseek_vl", "TokenizersBackend" if is_tokenizers_available() else None),
("deepseek_vl_hybrid", "TokenizersBackend" if is_tokenizers_available() else None),
("dia", "DiaTokenizer"),
("diffllama", "LlamaTokenizer" if is_tokenizers_available() else None),
("diffllama", "TokenizersBackend" if is_tokenizers_available() else None),
("distilbert", "BertTokenizer" if is_tokenizers_available() else None),
("dpr", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None),
("electra", "BertTokenizer" if is_tokenizers_available() else None),
("emu3", "GPT2Tokenizer" if is_tokenizers_available() else None),
("ernie", "BertTokenizer" if is_tokenizers_available() else None),
("ernie4_5", "LlamaTokenizer" if is_tokenizers_available() else None),
("ernie4_5_moe", "LlamaTokenizer" if is_tokenizers_available() else None),
("ernie4_5", "TokenizersBackend" if is_tokenizers_available() else None),
("ernie4_5_moe", "TokenizersBackend" if is_tokenizers_available() else None),
("esm", "EsmTokenizer"),
("exaone4", "GPT2Tokenizer" if is_tokenizers_available() else None),
("falcon", "TokenizersBackend" if is_tokenizers_available() else None),
@@ -171,15 +171,15 @@
("herbert", "HerbertTokenizer" if is_tokenizers_available() else None),
("hubert", "Wav2Vec2CTCTokenizer"),
("ibert", "RobertaTokenizer"),
("idefics", "LlamaTokenizer" if is_tokenizers_available() else None),
("idefics2", "LlamaTokenizer" if is_tokenizers_available() else None),
("idefics3", "LlamaTokenizer" if is_tokenizers_available() else None),
("idefics", "TokenizersBackend" if is_tokenizers_available() else None),
("idefics2", "TokenizersBackend" if is_tokenizers_available() else None),
("idefics3", "TokenizersBackend" if is_tokenizers_available() else None),
("instructblip", "GPT2Tokenizer" if is_tokenizers_available() else None),
("instructblipvideo", "GPT2Tokenizer" if is_tokenizers_available() else None),
("internvl", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
("jamba", "LlamaTokenizer" if is_tokenizers_available() else None),
("janus", "LlamaTokenizer" if is_tokenizers_available() else None),
("jetmoe", "LlamaTokenizer" if is_tokenizers_available() else None),
("jamba", "TokenizersBackend" if is_tokenizers_available() else None),
("janus", "TokenizersBackend" if is_tokenizers_available() else None),
("jetmoe", "TokenizersBackend" if is_tokenizers_available() else None),
("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
("kosmos-2.5", "TokenizersBackend" if is_tokenizers_available() else None),
("layoutlm", "BertTokenizer" if is_tokenizers_available() else None),
@@ -190,12 +190,12 @@
("lfm2_vl", "TokenizersBackend" if is_tokenizers_available() else None),
("lilt", "RobertaTokenizer" if is_tokenizers_available() else None),
("llama", "LlamaTokenizer" if is_tokenizers_available() else None),
("llama4", "LlamaTokenizer" if is_tokenizers_available() else None),
("llama4_text", "LlamaTokenizer" if is_tokenizers_available() else None),
("llava", "LlamaTokenizer" if is_tokenizers_available() else None),
("llava_next", "LlamaTokenizer" if is_tokenizers_available() else None),
("llava_next_video", "LlamaTokenizer" if is_tokenizers_available() else None),
("llava_onevision", "LlamaTokenizer" if is_tokenizers_available() else None),
("llama4", "TokenizersBackend" if is_tokenizers_available() else None),
("llama4_text", "TokenizersBackend" if is_tokenizers_available() else None),
("llava", "TokenizersBackend" if is_tokenizers_available() else None),
("llava_next", "TokenizersBackend" if is_tokenizers_available() else None),
("llava_next_video", "TokenizersBackend" if is_tokenizers_available() else None),
("llava_onevision", "TokenizersBackend" if is_tokenizers_available() else None),
("longformer", "RobertaTokenizer" if is_tokenizers_available() else None),
("longt5", "T5Tokenizer" if is_tokenizers_available() else None),
("luke", "LukeTokenizer"),
@@ -216,32 +216,32 @@
(
"MistralCommonBackend"
if is_mistral_common_available()
else ("LlamaTokenizer" if is_sentencepiece_available() else None),
"LlamaTokenizer" if is_tokenizers_available() and not is_mistral_common_available() else None,
else ("TokenizersBackend" if is_sentencepiece_available() else None),
"TokenizersBackend" if is_tokenizers_available() and not is_mistral_common_available() else None,
),
),
(
"mistral",
"MistralCommonBackend"
if is_mistral_common_available()
else ("LlamaTokenizer" if is_tokenizers_available() else None),
else ("TokenizersBackend" if is_tokenizers_available() else None),
),
(
"mistral3",
(
"MistralCommonBackend"
if is_mistral_common_available()
else ("LlamaTokenizer" if is_sentencepiece_available() else None),
"LlamaTokenizer" if is_tokenizers_available() and not is_mistral_common_available() else None,
else ("TokenizersBackend" if is_sentencepiece_available() else None),
"TokenizersBackend" if is_tokenizers_available() and not is_mistral_common_available() else None,
),
),
(
"mixtral",
"MistralCommonBackend"
if is_mistral_common_available()
else ("LlamaTokenizer" if is_tokenizers_available() else None),
else ("TokenizersBackend" if is_tokenizers_available() else None),
),
("mllama", "LlamaTokenizer" if is_tokenizers_available() else None),
("mllama", "TokenizersBackend" if is_tokenizers_available() else None),
("mluke", "MLukeTokenizer" if is_sentencepiece_available() else None),
("mm-grounding-dino", "BertTokenizer" if is_tokenizers_available() else None),
("mobilebert", "MobileBertTokenizer" if is_tokenizers_available() else None),
@@ -274,14 +274,14 @@
("owlv2", "CLIPTokenizerFast" if is_tokenizers_available() else None),
("owlvit", "CLIPTokenizerFast" if is_tokenizers_available() else None),
("paddleocr_vl", "TokenizersBackend" if is_tokenizers_available() else None),
("paligemma", "LlamaTokenizer" if is_tokenizers_available() else None),
("paligemma", "TokenizersBackend" if is_tokenizers_available() else None),
("pegasus", "PegasusTokenizer" if is_tokenizers_available() else None),
("pegasus_x", "PegasusTokenizer" if is_tokenizers_available() else None),
("perceiver", "PerceiverTokenizer"),
("persimmon", "LlamaTokenizer" if is_tokenizers_available() else None),
("persimmon", "TokenizersBackend" if is_tokenizers_available() else None),
("phi", "GPT2Tokenizer" if is_tokenizers_available() else None),
("phi3", "LlamaTokenizer" if is_tokenizers_available() else None),
("phimoe", "LlamaTokenizer" if is_tokenizers_available() else None),
("phi3", "TokenizersBackend" if is_tokenizers_available() else None),
("phimoe", "TokenizersBackend" if is_tokenizers_available() else None),
("phobert", "PhobertTokenizer"),
("pix2struct", "T5Tokenizer" if is_tokenizers_available() else None),
(
@@ -336,16 +336,16 @@
("tvp", "BertTokenizer" if is_tokenizers_available() else None),
("udop", "UdopTokenizer" if is_tokenizers_available() else None),
("umt5", "T5Tokenizer" if is_tokenizers_available() else None),
("video_llava", "LlamaTokenizer" if is_tokenizers_available() else None),
("video_llava", "TokenizersBackend" if is_tokenizers_available() else None),
[Collaborator review comment] @ArthurZucker Just wondering if the LlamaTokenizer for all of these was causing issues?

("vilt", "BertTokenizer" if is_tokenizers_available() else None),
("vipllava", "LlamaTokenizer" if is_tokenizers_available() else None),
("vipllava", "TokenizersBackend" if is_tokenizers_available() else None),
("visual_bert", "BertTokenizer" if is_tokenizers_available() else None),
("vits", "VitsTokenizer"),
(
"voxtral",
"MistralCommonBackend"
if is_mistral_common_available()
else ("LlamaTokenizer" if is_tokenizers_available() else None),
else ("TokenizersBackend" if is_tokenizers_available() else None),
),
("wav2vec2", "Wav2Vec2CTCTokenizer"),
("wav2vec2-bert", "Wav2Vec2CTCTokenizer"),
@@ -361,15 +361,29 @@
("xlstm", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None),
("xmod", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None),
("yoso", "AlbertTokenizer" if is_tokenizers_available() else None),
("zamba", "LlamaTokenizer" if is_tokenizers_available() else None),
("zamba2", "LlamaTokenizer" if is_tokenizers_available() else None),
("zamba", "TokenizersBackend" if is_tokenizers_available() else None),
("zamba2", "TokenizersBackend" if is_tokenizers_available() else None),
]
)

TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}

# Model types that should prioritize TOKENIZER_MAPPING over tokenizer_config
PRIORITIZE_MAPPING_FOR_MODELS = [
"ministral3",
"mistral3",
"mixtral",
"pixtral",
"voxtral",
"qwen2",
]

TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}


def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
@@ -675,6 +689,38 @@ def from_pretrained(
else:
tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)

# START PRIORITIZE MAPPING FOR MODELS
# Check config's model_type (not tokenizer_config's) for models with unreliable tokenizer_config
try:
config_model_type = (
getattr(config, "model_type", None)
if isinstance(config, PreTrainedConfig)
else PreTrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)[0].get("model_type")
)
except Exception:
config_model_type = None

if config_model_type in PRIORITIZE_MAPPING_FOR_MODELS:
if not isinstance(config, PreTrainedConfig):
if gguf_file:
gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
config = AutoConfig.for_model(**config_dict)
else:
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
)
config_for_lookup = config.encoder if isinstance(config, EncoderDecoderConfig) else config
if type(config_for_lookup) in TOKENIZER_MAPPING:
tokenizer_class = TOKENIZER_MAPPING.get(type(config_for_lookup), TokenizersBackend)
if isinstance(tokenizer_class, tuple):
tokenizer_class = tokenizer_class[1] or tokenizer_class[0]
if isinstance(tokenizer_class, str):
tokenizer_class = tokenizer_class_from_name(tokenizer_class)
if tokenizer_class is not None:
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
# END PRIORITIZE MAPPING FOR MODELS

# If that did not work, let's try to use the config.
if config_tokenizer_class is None:
if not isinstance(config, PreTrainedConfig):
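For readers skimming the diff: below is a minimal, self-contained sketch (not the PR's exact code) of what the new `PRIORITIZE_MAPPING_FOR_MODELS` branch does — for these model types the tokenizer class is taken from the static `TOKENIZER_MAPPING` keyed on the config's `model_type`, instead of trusting whatever class name sits in `tokenizer_config.json`. The helper name is hypothetical and exists only for illustration.

```python
# Hedged sketch of the prioritization added to tokenization_auto.py above.
# `resolve_tokenizer_class` is a hypothetical helper, not part of the transformers API.
PRIORITIZE_MAPPING_FOR_MODELS = ["ministral3", "mistral3", "mixtral", "pixtral", "voxtral", "qwen2"]

def resolve_tokenizer_class(model_type, mapping, class_from_tokenizer_config):
    """Prefer the static mapping for model types whose tokenizer_config is known to be unreliable."""
    if model_type in PRIORITIZE_MAPPING_FOR_MODELS and model_type in mapping:
        return mapping[model_type]          # e.g. "MistralCommonBackend" or "TokenizersBackend"
    return class_from_tokenizer_config      # otherwise fall back to tokenizer_config.json

# A mixtral checkpoint whose tokenizer_config.json still names "LlamaTokenizer":
mapping = {"mixtral": "MistralCommonBackend", "llama": "LlamaTokenizer"}
print(resolve_tokenizer_class("mixtral", mapping, "LlamaTokenizer"))  # MistralCommonBackend
print(resolve_tokenizer_class("llama", mapping, "LlamaTokenizer"))    # LlamaTokenizer
```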
4 changes: 3 additions & 1 deletion src/transformers/models/pixtral/processing_pixtral.py
@@ -150,7 +150,7 @@ def __call__(

output_kwargs = self._merge_kwargs(
PixtralProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
tokenizer_init_kwargs=getattr(self.tokenizer, "init_kwargs", {}),
**kwargs,
)

@@ -197,6 +197,8 @@ def __call__(

return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
# Remove return_token_type_ids as MistralCommonBackend doesn't support it
output_kwargs["text_kwargs"].pop("return_token_type_ids", None)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])

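Both pixtral changes follow the same defensive pattern: tolerate tokenizer backends that don't expose `init_kwargs`, and drop kwargs the backend (here MistralCommonBackend, per the in-diff comment) does not accept. A toy sketch under those assumptions — `MinimalBackend` is a stand-in, not a real transformers class:

```python
# Toy illustration of the defensive pattern used in processing_pixtral.py above.
class MinimalBackend:
    """Tokenizer backend with no `init_kwargs` attribute and no token_type_ids support."""
    def __call__(self, text, **kwargs):
        return {"input_ids": [[1, 2, 3]], "kwargs_seen": sorted(kwargs)}

tokenizer = MinimalBackend()

# getattr(..., {}) avoids the AttributeError that `self.tokenizer.init_kwargs` would raise
tokenizer_init_kwargs = getattr(tokenizer, "init_kwargs", {})

text_kwargs = {"padding": True, "return_token_type_ids": True}
text_kwargs.pop("return_token_type_ids", None)  # backend does not support it
print(tokenizer_init_kwargs, tokenizer("hello", **text_kwargs))
```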
10 changes: 0 additions & 10 deletions src/transformers/processing_utils.py
@@ -1467,16 +1467,6 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
if "token_type_ids" in tokenizer.model_input_names:
tokenizer.model_input_names.remove("token_type_ids")
args.append(tokenizer)
elif "PixtralProcessor" in cls.__name__ and "tokenizer" in sub_processor_type:
from tokenizers import pre_tokenizers

from .models.llama import LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
[pre_tokenizers.ByteLevel(False), tokenizer._tokenizer.pre_tokenizer]
)
args.append(tokenizer)
elif sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING:
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type]
sub_processor = auto_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
11 changes: 11 additions & 0 deletions src/transformers/tokenization_utils_tokenizers.py
@@ -30,6 +30,7 @@
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.models import BPE, Unigram
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

from .integrations.ggml import convert_gguf_tokenizer
@@ -236,6 +237,9 @@ def __init__(self, *args, **kwargs):
add_prefix_space = kwargs.get("add_prefix_space", False)
vocab_file = kwargs.get("vocab_file")

vocab = kwargs.get("vocab")
merges = kwargs.get("merges")

fast_tokenizer = None
if tokenizer_object is not None:
fast_tokenizer = copy.deepcopy(tokenizer_object)
@@ -252,6 +256,13 @@
kwargs.update(tokenizer_config)
if len(additional_kwargs) > 0:
kwargs.update(additional_kwargs)
elif self._tokenizer is None and vocab is not None:
# Build from vocab/merges extracted by convert_to_native_format
if merges is not None:
vocab_dict = vocab if isinstance(vocab, dict) else {w: i for i, (w, _) in enumerate(vocab)}
fast_tokenizer = TokenizerFast(BPE(vocab=vocab_dict, merges=merges, fuse_unk=True, dropout=None))
elif isinstance(vocab, list) and vocab and isinstance(vocab[0], (tuple, list)):
fast_tokenizer = TokenizerFast(Unigram(vocab=vocab, unk_id=kwargs.get("unk_id", 0)))
elif self._tokenizer is None:
raise ValueError(
"Couldn't instantiate the backend tokenizer from one of: \n"
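The new fallback in tokenization_utils_tokenizers.py builds a `tokenizers` backend directly from a vocab (plus optional merges) when no tokenizer file or object is available. A standalone sketch of the two paths with toy data (the real code receives vocab/merges via convert_to_native_format); tuple-style merges assume a reasonably recent `tokenizers` release:

```python
# Standalone sketch of the vocab/merges fallback added above; toy vocabulary for illustration.
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram

# BPE path: dict vocab plus merge pairs.
bpe_vocab = {"l": 0, "o": 1, "w": 2, "lo": 3, "low": 4}
bpe_merges = [("l", "o"), ("lo", "w")]
bpe_backend = Tokenizer(BPE(vocab=bpe_vocab, merges=bpe_merges, fuse_unk=True, dropout=None))
print(bpe_backend.encode("low").tokens)  # ['low']

# Unigram path: list of (piece, log-probability) tuples plus an unk id.
uni_vocab = [("<unk>", 0.0), ("lo", -1.0), ("w", -2.0)]
uni_backend = Tokenizer(Unigram(vocab=uni_vocab, unk_id=0))
print(uni_backend.encode("low").tokens)  # ['lo', 'w']
```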
2 changes: 1 addition & 1 deletion tests/models/chinese_clip/test_processing_chinese_clip.py
@@ -56,7 +56,7 @@ def _setup_tokenizer(cls):
vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
return tokenizer_class(vocab_file=vocab_file)
return tokenizer_class.from_pretrained(cls.tmpdirname)

@classmethod
def _setup_image_processor(cls):
4 changes: 2 additions & 2 deletions tests/models/deepseek_vl/test_processing_deepseek_vl.py
@@ -29,8 +29,8 @@ class DeepseekVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@classmethod
def _setup_tokenizer(cls):
tokenizer_class = cls._get_component_class_from_processor("tokenizer")
return tokenizer_class(
vocab_file=SAMPLE_VOCAB,
return tokenizer_class.from_pretrained(
SAMPLE_VOCAB,
extra_special_tokens={
"pad_token": "<|end▁of▁sentence|>",
"image_token": "<image_placeholder>",
@@ -29,8 +29,8 @@ class DeepseekVLHybridProcessorTest(ProcessorTesterMixin, unittest.TestCase):
@classmethod
def _setup_tokenizer(cls):
tokenizer_class = cls._get_component_class_from_processor("tokenizer")
return tokenizer_class(
vocab_file=SAMPLE_VOCAB,
return tokenizer_class.from_pretrained(
get_tests_dir("fixtures"),
extra_special_tokens={
"pad_token": "<|end▁of▁sentence|>",
"image_token": "<image_placeholder>",