Skip to content

Commit f065e40

Browse files
authored
[v5] 🚨Refactor subprocessors handling in processors (#41633)
* remove attributes and add all missing sub processors to their auto classes
* remove all mentions of .attributes
* cleanup
* fix processor tests
* fix modular
* remove last attributes
* fixup
* fixes after merge
* fix wrong tokenizer in auto florence2
* fix missing audio_processor + nits
* Override __init__ in NewProcessor and change hf-internal-testing-repo (temporarily)
* fix auto tokenizer test
* add init to markup_lm
* update CustomProcessor in custom_processing
* remove print
* nit
* fix test modeling owlv2
* fix test_processing_layoutxlm
* Fix owlv2, wav2vec2, markuplm, voxtral issues
* add support for loading and saving multiple tokenizer natively
* remove exclude_attributes from save_pretrained
* modifs after review
1 parent 91d250e commit f065e40

File tree

149 files changed

+368
-839
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

149 files changed

+368
-839
lines changed

src/transformers/models/align/processing_align.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -59,9 +59,6 @@ class AlignProcessor(ProcessorMixin):
5959
6060
"""
6161

62-
attributes = ["image_processor", "tokenizer"]
63-
image_processor_class = "EfficientNetImageProcessor"
64-
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
6562
valid_processor_kwargs = AlignProcessorKwargs
6663

6764
def __init__(self, image_processor, tokenizer):

src/transformers/models/altclip/processing_altclip.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,6 @@ class AltCLIPProcessor(ProcessorMixin):
3535
The tokenizer is a required input.
3636
"""
3737

38-
attributes = ["image_processor", "tokenizer"]
39-
image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
40-
tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")
41-
4238
@deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
4339
def __init__(self, image_processor=None, tokenizer=None):
4440
super().__init__(image_processor, tokenizer)

src/transformers/models/aria/modular_aria.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -906,10 +906,6 @@ class AriaProcessor(ProcessorMixin):
906906
A dictionary indicating size conversions for images.
907907
"""
908908

909-
attributes = ["image_processor", "tokenizer"]
910-
image_processor_class = "AriaImageProcessor"
911-
tokenizer_class = "AutoTokenizer"
912-
913909
def __init__(
914910
self,
915911
image_processor=None,

src/transformers/models/aria/processing_aria.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,6 @@ class AriaProcessor(ProcessorMixin):
6767
A dictionary indicating size conversions for images.
6868
"""
6969

70-
attributes = ["image_processor", "tokenizer"]
71-
image_processor_class = "AriaImageProcessor"
72-
tokenizer_class = "AutoTokenizer"
73-
7470
def __init__(
7571
self,
7672
image_processor=None,

src/transformers/models/auto/configuration_auto.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,7 @@
223223
("layoutlm", "LayoutLMConfig"),
224224
("layoutlmv2", "LayoutLMv2Config"),
225225
("layoutlmv3", "LayoutLMv3Config"),
226+
("layoutxlm", "LayoutLMv2Config"),
226227
("led", "LEDConfig"),
227228
("levit", "LevitConfig"),
228229
("lfm2", "Lfm2Config"),

src/transformers/models/auto/feature_extraction_auto.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
("audio-spectrogram-transformer", "ASTFeatureExtractor"),
4242
("clap", "ClapFeatureExtractor"),
4343
("clvp", "ClvpFeatureExtractor"),
44+
("csm", "EncodecFeatureExtractor"),
4445
("dac", "DacFeatureExtractor"),
4546
("data2vec-audio", "Wav2Vec2FeatureExtractor"),
4647
("dia", "DiaFeatureExtractor"),
@@ -49,14 +50,20 @@
4950
("granite_speech", "GraniteSpeechFeatureExtractor"),
5051
("hubert", "Wav2Vec2FeatureExtractor"),
5152
("kyutai_speech_to_text", "KyutaiSpeechToTextFeatureExtractor"),
53+
("markuplm", "MarkupLMFeatureExtractor"),
5254
("mctct", "MCTCTFeatureExtractor"),
5355
("mimi", "EncodecFeatureExtractor"),
5456
("moonshine", "Wav2Vec2FeatureExtractor"),
5557
("moshi", "EncodecFeatureExtractor"),
58+
("musicgen", "EncodecFeatureExtractor"),
59+
("musicgen_melody", "MusicgenMelodyFeatureExtractor"),
5660
("parakeet_ctc", "ParakeetFeatureExtractor"),
5761
("parakeet_encoder", "ParakeetFeatureExtractor"),
5862
("phi4_multimodal", "Phi4MultimodalFeatureExtractor"),
5963
("pop2piano", "Pop2PianoFeatureExtractor"),
64+
("qwen2_5_omni", "WhisperFeatureExtractor"),
65+
("qwen2_audio", "WhisperFeatureExtractor"),
66+
("qwen3_omni_moe", "WhisperFeatureExtractor"),
6067
("seamless_m4t", "SeamlessM4TFeatureExtractor"),
6168
("seamless_m4t_v2", "SeamlessM4TFeatureExtractor"),
6269
("sew", "Wav2Vec2FeatureExtractor"),
@@ -66,6 +73,7 @@
6673
("unispeech", "Wav2Vec2FeatureExtractor"),
6774
("unispeech-sat", "Wav2Vec2FeatureExtractor"),
6875
("univnet", "UnivNetFeatureExtractor"),
76+
("voxtral", "WhisperFeatureExtractor"),
6977
("wav2vec2", "Wav2Vec2FeatureExtractor"),
7078
("wav2vec2-bert", "Wav2Vec2FeatureExtractor"),
7179
("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"),

src/transformers/models/auto/image_processing_auto.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,9 @@
6262
("aimv2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
6363
("aimv2_vision_model", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
6464
("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
65+
("altclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
6566
("aria", ("AriaImageProcessor", None)),
67+
("aya_vision", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
6668
("beit", ("BeitImageProcessor", "BeitImageProcessorFast")),
6769
("bit", ("BitImageProcessor", "BitImageProcessorFast")),
6870
("blip", ("BlipImageProcessor", "BlipImageProcessorFast")),
@@ -73,6 +75,8 @@
7375
("clip", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
7476
("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")),
7577
("cohere2_vision", (None, "Cohere2VisionImageProcessorFast")),
78+
("colpali", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
79+
("colqwen2", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
7680
("conditional_detr", ("ConditionalDetrImageProcessor", "ConditionalDetrImageProcessorFast")),
7781
("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
7882
("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
@@ -95,8 +99,10 @@
9599
("efficientformer", ("EfficientFormerImageProcessor", None)),
96100
("efficientloftr", ("EfficientLoFTRImageProcessor", "EfficientLoFTRImageProcessorFast")),
97101
("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")),
102+
("emu3", ("Emu3ImageProcessor", None)),
98103
("eomt", ("EomtImageProcessor", "EomtImageProcessorFast")),
99104
("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")),
105+
("florence2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
100106
("focalnet", ("BitImageProcessor", "BitImageProcessorFast")),
101107
("fuyu", ("FuyuImageProcessor", "FuyuImageProcessorFast")),
102108
("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")),
@@ -114,11 +120,13 @@
114120
("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")),
115121
("imagegpt", ("ImageGPTImageProcessor", "ImageGPTImageProcessorFast")),
116122
("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")),
123+
("internvl", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")),
117124
("janus", ("JanusImageProcessor", "JanusImageProcessorFast")),
118125
("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")),
119126
("kosmos-2.5", ("Kosmos2_5ImageProcessor", "Kosmos2_5ImageProcessorFast")),
120127
("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")),
121128
("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
129+
("layoutxlm", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessor")),
122130
("levit", ("LevitImageProcessor", "LevitImageProcessorFast")),
123131
("lfm2_vl", (None, "Lfm2VlImageProcessorFast")),
124132
("lightglue", ("LightGlueImageProcessor", "LightGlueImageProcessorFast")),
@@ -141,6 +149,7 @@
141149
("mobilevitv2", ("MobileViTImageProcessor", "MobileViTImageProcessorFast")),
142150
("nat", ("ViTImageProcessor", "ViTImageProcessorFast")),
143151
("nougat", ("NougatImageProcessor", "NougatImageProcessorFast")),
152+
("omdet-turbo", ("DetrImageProcessor", "DetrImageProcessorFast")),
144153
("oneformer", ("OneFormerImageProcessor", "OneFormerImageProcessorFast")),
145154
("ovis2", ("Ovis2ImageProcessor", "Ovis2ImageProcessorFast")),
146155
("owlv2", ("Owlv2ImageProcessor", "Owlv2ImageProcessorFast")),
@@ -155,14 +164,17 @@
155164
("prompt_depth_anything", ("PromptDepthAnythingImageProcessor", "PromptDepthAnythingImageProcessorFast")),
156165
("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")),
157166
("pvt_v2", ("PvtImageProcessor", "PvtImageProcessorFast")),
167+
("qwen2_5_omni", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
158168
("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
159169
("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
170+
("qwen3_omni_moe", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
160171
("qwen3_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
161172
("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
162173
("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
163174
("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")),
164175
("sam", ("SamImageProcessor", "SamImageProcessorFast")),
165176
("sam2", (None, "Sam2ImageProcessorFast")),
177+
("sam2_video", (None, "Sam2ImageProcessorFast")),
166178
("sam_hq", ("SamImageProcessor", "SamImageProcessorFast")),
167179
("segformer", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
168180
("seggpt", ("SegGptImageProcessor", None)),
@@ -180,12 +192,14 @@
180192
("textnet", ("TextNetImageProcessor", "TextNetImageProcessorFast")),
181193
("timesformer", ("VideoMAEImageProcessor", None)),
182194
("timm_wrapper", ("TimmWrapperImageProcessor", None)),
195+
("trocr", ("ViTImageProcessor", "ViTImageProcessorFast")),
183196
("tvlt", ("TvltImageProcessor", None)),
184197
("tvp", ("TvpImageProcessor", "TvpImageProcessorFast")),
185198
("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
186199
("upernet", ("SegformerImageProcessor", "SegformerImageProcessorFast")),
187200
("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
188201
("video_llama_3", ("VideoLlama3ImageProcessor", "VideoLlama3ImageProcessorFast")),
202+
("video_llava", ("VideoLlavaImageProcessor", None)),
189203
("videomae", ("VideoMAEImageProcessor", None)),
190204
("vilt", ("ViltImageProcessor", "ViltImageProcessorFast")),
191205
("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")),

src/transformers/models/auto/processing_auto.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@
107107
("mllama", "MllamaProcessor"),
108108
("mm-grounding-dino", "GroundingDinoProcessor"),
109109
("moonshine", "Wav2Vec2Processor"),
110+
("omdet-turbo", "OmDetTurboProcessor"),
110111
("oneformer", "OneFormerProcessor"),
111112
("ovis2", "Ovis2Processor"),
112113
("owlv2", "Owlv2Processor"),

0 commit comments

Comments
 (0)