@@ -45,7 +45,7 @@ class SentencePieceTokenTypes(IntEnum):
 
 class ModelType(IntEnum):
     TEXT = 1
-    VISION = 2
+    MMPROJ = 2
 
 
 AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
@@ -54,7 +54,7 @@ class ModelType(IntEnum):
 class ModelBase:
     _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
         ModelType.TEXT: {},
-        ModelType.VISION: {},
+        ModelType.MMPROJ: {},
     }
 
     dir_model: Path
@@ -88,7 +88,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
         if type(self) is ModelBase or \
                 type(self) is TextModel or \
-                type(self) is VisionModel:
+                type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
         self.dir_model = dir_model
@@ -439,7 +439,7 @@ def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
         assert names
 
         def func(modelcls: AnyModel) -> AnyModel:
-            model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT
+            model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
            for name in names:
                 cls._model_classes[model_type][name] = modelcls
             return modelcls
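For orientation, the registration pattern above reduces to a small decorator-plus-registry idiom. A minimal runnable sketch follows; `Kind`, `ToyBase`, and `ToyText` are illustrative stand-ins, not names from the converter:

```python
# Toy sketch of the registry-by-decorator pattern used by ModelBase.register.
# All names here are invented for illustration.
from enum import IntEnum


class Kind(IntEnum):
    TEXT = 1
    MMPROJ = 2


class ToyBase:
    _classes: dict[Kind, dict[str, type]] = {Kind.TEXT: {}, Kind.MMPROJ: {}}

    @classmethod
    def register(cls, *names):
        def func(modelcls):
            # route the class into the per-type registry by its declared kind
            kind = getattr(modelcls, "kind", Kind.TEXT)
            for name in names:
                cls._classes[kind][name] = modelcls
            return modelcls
        return func


@ToyBase.register("ToyForCausalLM")
class ToyText(ToyBase):
    kind = Kind.TEXT


assert ToyBase._classes[Kind.TEXT]["ToyForCausalLM"] is ToyText
```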
@@ -1115,24 +1115,27 @@ def _try_set_pooling_type(self) -> None:
             self.gguf_writer.add_pooling_type(pooling_type)
 
 
-class VisionModel(ModelBase):
-    model_type = ModelType.VISION
-    model_arch = gguf.MODEL_ARCH.CLIP_VISION
+class MmprojModel(ModelBase):
+    model_type = ModelType.MMPROJ
+    model_arch = gguf.MODEL_ARCH.MMPROJ
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
-    has_vision_encoder: bool = True
+
+    has_vision_encoder: bool = True # by default
     has_audio_encoder: bool = False
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
-            raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
+        if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
+            raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
+
+        if self.has_vision_encoder and self.has_audio_encoder:
+            raise NotImplementedError("both vision + audio not supported yet")
 
         # get n_embd of the text model
         if "text_config" not in self.hparams:
             self.hparams["text_config"] = {}
-        # TODO @ngxson : separate VisionModel and AudioModel
         if "audio_config" not in self.hparams:
             self.hparams["audio_config"] = {}
         text_config = {**self.hparams, **self.hparams["text_config"]}
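The `text_config` merge above relies on dict-unpacking precedence: keys from the sub-config (unpacked second) override top-level keys of `hparams`. A minimal illustration with toy values:

```python
# Toy check of the {**a, **b} merge semantics used for text_config above.
hparams = {"hidden_size": 1024, "text_config": {"hidden_size": 4096}}
text_config = {**hparams, **hparams["text_config"]}
assert text_config["hidden_size"] == 4096  # sub-config key wins
```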
@@ -1150,37 +1153,49 @@ def __init__(self, *args, **kwargs):
             raise ValueError("vision_config / audio_config not found in hparams")
 
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
-        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
         with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
             self.preprocessor_config = json.load(f)
 
     def set_type(self):
-        self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
+        self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+
         if self.has_vision_encoder:
-            self.gguf_writer.add_vision_has_vision_encoder(True)
-        if self.has_audio_encoder:
-            self.gguf_writer.add_vision_has_audio_encoder(True)
-
-        # vision config
-        self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
-        self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
-        self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
-        self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_vision_block_count(self.block_count)
-        self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
-
-        # preprocessor config
-        self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
-        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
+            self.gguf_writer.add_clip_has_vision_encoder(True)
+            self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+
+            # vision config
+            self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
+            self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
+            self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
+            self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
+            self.gguf_writer.add_vision_block_count(self.block_count)
+            self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+
+            # preprocessor config
+            self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+            self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
+
+        elif self.has_audio_encoder:
+            self.gguf_writer.add_clip_has_audio_encoder(True)
+            self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
+
+            # audio config
+            self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
+            self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
+            self.gguf_writer.add_audio_block_count(self.block_count)
+            self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
+
+        else:
+            raise ValueError("MmprojModel must have either vision or audio encoder")
 
     def write_vocab(self):
-        raise ValueError("VisionModel does not support vocab writing")
+        raise ValueError("MmprojModel does not support vocab writing")
 
 
 @ModelBase.register("GPTNeoXForCausalLM")
@@ -1964,7 +1979,7 @@ def prepare_tensors(self):
     "LlavaForConditionalGeneration", # pixtral
     "Mistral3ForConditionalGeneration", # mistral small 3.1
 )
-class LlavaVisionModel(VisionModel):
+class LlavaVisionModel(MmprojModel):
     img_break_tok_id = -1
 
     def __init__(self, *args, **kwargs):
@@ -1990,7 +2005,7 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
         if hparams["model_type"] == "pixtral":
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
             self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
 
             # hidden_act
@@ -2029,7 +2044,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
-class SmolVLMModel(VisionModel):
+class SmolVLMModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         if self.hparams["model_type"] == "smolvlm_vision":
@@ -2041,7 +2056,7 @@ def __init__(self, *args, **kwargs):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
         self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
         self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
         self.gguf_writer.add_vision_use_gelu(True)
@@ -2107,10 +2122,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 
 
 @ModelBase.register("Llama4ForConditionalGeneration")
-class Llama4VisionModel(VisionModel):
+class Llama4VisionModel(MmprojModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
         self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
         self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
         assert self.hparams["hidden_act"] == "gelu"
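The Llama 4 projector scale factor is derived as the reciprocal of the pixel-shuffle ratio; a quick check with a toy value (0.5 is illustrative, not taken from any config):

```python
# Toy check of the scale-factor derivation used above.
pixel_shuffle_ratio = 0.5
assert int(1.0 / pixel_shuffle_ratio) == 2
```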
@@ -2683,7 +2698,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
-class Qwen2VLVisionModel(VisionModel):
+class Qwen2VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.hparams["image_size"] = self.hparams.get("image_size", 560)
@@ -2698,9 +2713,9 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
         if self.global_config['model_type'] == 'qwen2_vl':
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
         elif self.global_config['model_type'] == 'qwen2_5_vl':
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
             self.gguf_writer.add_vision_use_silu(True)
             # find n_wa_pattern (window attention pattern)
             fullatt_block_indexes = hparams.get("fullatt_block_indexes")
@@ -2759,11 +2774,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @ModelBase.register("InternVisionModel")
-class InternVisionModel(VisionModel):
+class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
         # hidden_act
         if hparams["hidden_act"] == "silu":
@@ -4021,11 +4036,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @ModelBase.register("Gemma3ForConditionalGeneration")
-class Gemma3VisionModel(VisionModel):
+class Gemma3VisionModel(MmprojModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
         # default values below are taken from HF transformers code
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
         self.gguf_writer.add_vision_use_gelu(True)
@@ -5982,24 +5997,22 @@ def __init__(self, *args, **kwargs):
 
 
 @ModelBase.register("UltravoxModel")
-class UltravoxAudioModel(VisionModel):
+class UltravoxAudioModel(MmprojModel):
+    has_vision_encoder = False # no vision encoder
+    has_audio_encoder = True
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.has_vision_encoder = False
-        self.has_audio_encoder = True
-        self.hparams["image_size"] = self.hparams["num_mel_bins"]
-        self.hparams["patch_size"] = self.hparams["num_mel_bins"]
         self.hparams["hidden_size"] = self.hparams["d_model"]
         self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
         self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
-        self.preprocessor_config["image_mean"] = [0, 0, 0]
-        self.preprocessor_config["image_std"] = [0, 0, 0]
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.ULTRAVOX)
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Projector.STACK_FACTOR, self.global_config["stack_factor"])
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims # unused
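The stack factor written here controls how many consecutive audio-encoder frames get concatenated into one projector input at runtime; the converter only records the value. A toy illustration of that reshape (NumPy; shapes are illustrative, not Ultravox's actual dimensions):

```python
import numpy as np

# Toy sketch of frame stacking by a stack factor (assumed semantics; the
# runtime, not this converter, performs the reshape).
stack_factor = 8
frames = np.zeros((64, 1280))  # (n_frames, n_embd_audio)
stacked = frames.reshape(64 // stack_factor, 1280 * stack_factor)
print(stacked.shape)  # (8, 10240)
```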
@@ -6195,13 +6208,15 @@ def split_str_to_n_bytes(split_str: str) -> int:
 
 
 def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
+    # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
+    # maybe we should fallback to text model's arch in that case, since not many models have both
     text_config = hparams.get("text_config", {})
     vision_config = hparams.get("vision_config", {})
     arch = hparams["architectures"][0]
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
-    elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
+    elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
         arch = vision_config["architectures"][0]
     return arch
 
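A quick usage sketch of the sub-config fallback above, assuming the converter's module context; the architecture names are invented toy values:

```python
# Toy hparams exercising get_model_architecture's sub-config lookup.
hparams = {
    "architectures": ["FooForConditionalGeneration"],
    "vision_config": {"architectures": ["FooVisionModel"]},
}
assert get_model_architecture(hparams, ModelType.TEXT) == "FooForConditionalGeneration"
assert get_model_architecture(hparams, ModelType.MMPROJ) == "FooVisionModel"
```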
@@ -6264,7 +6279,7 @@ def main() -> None:
 
     with torch.inference_mode():
         output_type = ftype_map[args.outtype]
-        model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
+        model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
         hparams = ModelBase.load_hparams(dir_model)
         model_architecture = get_model_architecture(hparams, model_type)
         logger.info(f"Model architecture: {model_architecture}")