@@ -67,17 +67,16 @@ def __init__(
         self.initializer_range = initializer_range


-class Qwen2_5_VLConfig(PretrainedConfig):
+class Qwen2_5_VLTextConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
+    This is the configuration class to store the configuration of a [`Qwen2_5_VLTextModel`]. It is used to instantiate a
     Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
     with the defaults will yield a similar configuration to that of
     Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).

     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.

-
     Args:
         vocab_size (`int`, *optional*, defaults to 152064):
             Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
@@ -120,8 +119,6 @@ class Qwen2_5_VLConfig(PretrainedConfig):
             The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        vision_config (`Dict`, *optional*):
-            The config for the visual encoder initialization.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
@@ -161,20 +158,20 @@ class Qwen2_5_VLConfig(PretrainedConfig):
                 Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE

     ```python
-    >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
+    >>> from transformers import Qwen2_5_VLTextModel, Qwen2_5_VLConfig

     >>> # Initializing a Qwen2_5_VL style configuration
     >>> configuration = Qwen2_5_VLConfig()

     >>> # Initializing a model from the Qwen2-VL-7B style configuration
-    >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
+    >>> model = Qwen2_5_VLTextModel(configuration)

     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""

-    model_type = "qwen2_5_vl"
-    sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
+    model_type = "qwen2_5_vl_text"
+    base_config_key = "text_config"
     keys_to_ignore_at_inference = ["past_key_values"]
     # Default tensor parallel plan for base model `Qwen2_5_VL`
     base_model_tp_plan = {
@@ -211,15 +208,9 @@ def __init__(
         sliding_window=4096,
         max_window_layers=80,
         attention_dropout=0.0,
-        vision_config=None,
         rope_scaling=None,
         **kwargs,
     ):
-        if isinstance(vision_config, dict):
-            self.vision_config = self.sub_configs["vision_config"](**vision_config)
-        elif vision_config is None:
-            self.vision_config = self.sub_configs["vision_config"]()
-
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -257,4 +248,67 @@ def __init__(
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


-__all__ = ["Qwen2_5_VLConfig"]
+class Qwen2_5_VLConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
+    Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen2_5_VLTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen2_5_VLVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            The video token index to encode the video prompt.
+
+    ```python
+    >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
+
+    >>> # Initializing a Qwen2_5_VL style configuration
+    >>> configuration = Qwen2_5_VLConfig()
+
+    >>> # Initializing a model from the Qwen2-VL-7B style configuration
+    >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen2_5_vl"
+    sub_configs = {"vision_config": Qwen2_5_VLVisionConfig, "text_config": Qwen2_5_VLTextConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            # For BC use all kwargs to init `TextConfig`
+            self.text_config = self.sub_configs["text_config"](**kwargs)
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["Qwen2_5_VLConfig", "Qwen2_5_VLTextConfig"]
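The diff above splits the former flat config into a dedicated `Qwen2_5_VLTextConfig` plus a thin multimodal `Qwen2_5_VLConfig` wrapper that holds `text_config`, `vision_config`, and the image/video token ids. Below is a minimal sketch of how the restructured config can be exercised from the user side; it assumes a transformers build that already includes this change (both classes are exported via the new `__all__`), and the printed values only reflect the defaults and fields visible in this diff.

```python
from transformers import Qwen2_5_VLConfig, Qwen2_5_VLTextConfig

# Nested style: the text backbone is now configured through `text_config`.
config = Qwen2_5_VLConfig(text_config={"hidden_size": 1024, "vocab_size": 152064})
assert isinstance(config.text_config, Qwen2_5_VLTextConfig)
print(config.text_config.hidden_size)  # 1024
print(config.image_token_id)           # 151655 (default from the new __init__)

# Backwards-compatible style: flat kwargs are forwarded to the text sub-config
# via the `elif text_config is None` branch shown in the diff.
legacy = Qwen2_5_VLConfig(hidden_size=1024)
print(legacy.text_config.hidden_size)  # 1024
```

The BC branch is what keeps old checkpoints and scripts working: configs saved before the split, which stored text hyperparameters at the top level, are still routed into the new `text_config` on load.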