
Commit 9afb3af

refactor kv metadata
1 parent 7602ee4 commit 9afb3af

File tree

5 files changed: +194 -104 lines changed


convert_hf_to_gguf.py

Lines changed: 70 additions & 55 deletions
@@ -45,7 +45,7 @@ class SentencePieceTokenTypes(IntEnum):
 
 class ModelType(IntEnum):
     TEXT = 1
-    VISION = 2
+    MMPROJ = 2
 
 
 AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
@@ -54,7 +54,7 @@ class ModelType(IntEnum):
 class ModelBase:
     _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
         ModelType.TEXT: {},
-        ModelType.VISION: {},
+        ModelType.MMPROJ: {},
     }
 
     dir_model: Path
@@ -88,7 +88,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
         if type(self) is ModelBase or \
                 type(self) is TextModel or \
-                type(self) is VisionModel:
+                type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
         self.dir_model = dir_model
@@ -439,7 +439,7 @@ def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
         assert names
 
         def func(modelcls: AnyModel) -> AnyModel:
-            model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT
+            model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
             for name in names:
                 cls._model_classes[model_type][name] = modelcls
             return modelcls
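
Note: with this change, ModelBase.register files a converter class under ModelType.MMPROJ whenever its model_arch is gguf.MODEL_ARCH.MMPROJ, and under ModelType.TEXT otherwise. A minimal sketch of that routing inside convert_hf_to_gguf.py (the class and architecture names below are illustrative, not part of this commit):

    @ModelBase.register("FooForCausalLM")   # hypothetical HF architecture name
    class FooModel(TextModel):
        model_arch = gguf.MODEL_ARCH.LLAMA  # anything other than MMPROJ -> TEXT bucket

    # the decorator stores the class in the per-type registry:
    assert ModelBase._model_classes[ModelType.TEXT]["FooForCausalLM"] is FooModel
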
@@ -1115,24 +1115,27 @@ def _try_set_pooling_type(self) -> None:
         self.gguf_writer.add_pooling_type(pooling_type)
 
 
-class VisionModel(ModelBase):
-    model_type = ModelType.VISION
-    model_arch = gguf.MODEL_ARCH.CLIP_VISION
+class MmprojModel(ModelBase):
+    model_type = ModelType.MMPROJ
+    model_arch = gguf.MODEL_ARCH.MMPROJ
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
-    has_vision_encoder: bool = True
+
+    has_vision_encoder: bool = True # by default
     has_audio_encoder: bool = False
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
-            raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
+        if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
+            raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
+
+        if self.has_vision_encoder and self.has_audio_encoder:
+            raise NotImplementedError("both vision + audio not supported yet")
 
         # get n_embd of the text model
         if "text_config" not in self.hparams:
             self.hparams["text_config"] = {}
-        # TODO @ngxson : separate VisionModel and AudioModel
         if "audio_config" not in self.hparams:
             self.hparams["audio_config"] = {}
         text_config = {**self.hparams, **self.hparams["text_config"]}
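
Note: MmprojModel still resolves shared hyperparameters through a merged config: the text_config sub-dict is overlaid on the top-level hparams, so a key present in both resolves to the sub-config value. A small worked example of that merge order (the values are made up):

    hparams = {
        "hidden_size": 1152,                   # e.g. vision tower width at the top level
        "text_config": {"hidden_size": 4096},  # text model width in the sub-config
    }
    text_config = {**hparams, **hparams["text_config"]}
    assert text_config["hidden_size"] == 4096  # the sub-config value wins
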
@@ -1150,37 +1153,49 @@ def __init__(self, *args, **kwargs):
             raise ValueError("vision_config / audio_config not found in hparams")
 
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
-        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
+        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
         with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
             self.preprocessor_config = json.load(f)
 
     def set_type(self):
-        self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
+        self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
 
     def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+
         if self.has_vision_encoder:
-            self.gguf_writer.add_vision_has_vision_encoder(True)
-        if self.has_audio_encoder:
-            self.gguf_writer.add_vision_has_audio_encoder(True)
-
-        # vision config
-        self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
-        self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
-        self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
-        self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
-        self.gguf_writer.add_vision_block_count(self.block_count)
-        self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
-
-        # preprocessor config
-        self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
-        self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
+            self.gguf_writer.add_clip_has_vision_encoder(True)
+            self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
+
+            # vision config
+            self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
+            self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
+            self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
+            self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
+            self.gguf_writer.add_vision_block_count(self.block_count)
+            self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
+
+            # preprocessor config
+            self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
+            self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
+
+        elif self.has_audio_encoder:
+            self.gguf_writer.add_clip_has_audio_encoder(True)
+            self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
+
+            # audio config
+            self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
+            self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
+            self.gguf_writer.add_audio_block_count(self.block_count)
+            self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
+
+        else:
+            raise ValueError("MmprojModel must have either vision or audio encoder")
 
     def write_vocab(self):
-        raise ValueError("VisionModel does not support vocab writing")
+        raise ValueError("MmprojModel does not support vocab writing")
 
 
 @ModelBase.register("GPTNeoXForCausalLM")
@@ -1964,7 +1979,7 @@ def prepare_tensors(self):
     "LlavaForConditionalGeneration", # pixtral
     "Mistral3ForConditionalGeneration", # mistral small 3.1
 )
-class LlavaVisionModel(VisionModel):
+class LlavaVisionModel(MmprojModel):
     img_break_tok_id = -1
 
     def __init__(self, *args, **kwargs):
@@ -1990,7 +2005,7 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
         if hparams["model_type"] == "pixtral":
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
             self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
 
             # hidden_act
@@ -2029,7 +2044,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
-class SmolVLMModel(VisionModel):
+class SmolVLMModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         if self.hparams["model_type"] == "smolvlm_vision":
@@ -2041,7 +2056,7 @@ def __init__(self, *args, **kwargs):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
         self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
         self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
         self.gguf_writer.add_vision_use_gelu(True)
@@ -2107,10 +2122,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
 
 
 @ModelBase.register("Llama4ForConditionalGeneration")
-class Llama4VisionModel(VisionModel):
+class Llama4VisionModel(MmprojModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
         self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
         self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
         assert self.hparams["hidden_act"] == "gelu"
@@ -2683,7 +2698,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
-class Qwen2VLVisionModel(VisionModel):
+class Qwen2VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.hparams["image_size"] = self.hparams.get("image_size", 560)
@@ -2698,9 +2713,9 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
         if self.global_config['model_type'] == 'qwen2_vl':
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
         elif self.global_config['model_type'] == 'qwen2_5_vl':
-            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
             self.gguf_writer.add_vision_use_silu(True)
         # find n_wa_pattern (window attention pattern)
         fullatt_block_indexes = hparams.get("fullatt_block_indexes")
@@ -2759,11 +2774,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @ModelBase.register("InternVisionModel")
-class InternVisionModel(VisionModel):
+class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
         # hidden_act
         if hparams["hidden_act"] == "silu":
@@ -4021,11 +4036,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 
 @ModelBase.register("Gemma3ForConditionalGeneration")
-class Gemma3VisionModel(VisionModel):
+class Gemma3VisionModel(MmprojModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
         # default values below are taken from HF tranformers code
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
         self.gguf_writer.add_vision_use_gelu(True)
@@ -5982,24 +5997,22 @@ def __init__(self, *args, **kwargs):
 
 
 @ModelBase.register("UltravoxModel")
-class UltravoxAudioModel(VisionModel):
+class UltravoxAudioModel(MmprojModel):
+    has_vision_encoder = False # no vision encoder
+    has_audio_encoder = True
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.has_vision_encoder = False
-        self.has_audio_encoder = True
-        self.hparams["image_size"] = self.hparams["num_mel_bins"]
-        self.hparams["patch_size"] = self.hparams["num_mel_bins"]
         self.hparams["hidden_size"] = self.hparams["d_model"]
         self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
         self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
-        self.preprocessor_config["image_mean"] = [0, 0, 0]
-        self.preprocessor_config["image_std"] = [0, 0, 0]
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.ULTRAVOX)
-        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
-        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Projector.STACK_FACTOR, self.global_config["stack_factor"])
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
+        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         del bid, new_name, n_dims # unused
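
Note: UltravoxAudioModel now declares the encoder flags as class attributes and writes the mel-bin count through a dedicated audio key instead of aliasing it to image_size/patch_size. A quick way to sanity-check a converted file with gguf-py (the output filename is illustrative and this snippet is not part of the commit):

    from gguf import GGUFReader

    reader = GGUFReader("mmproj-ultravox-f16.gguf")  # illustrative output path
    audio_keys = [name for name in reader.fields if name.startswith("clip.audio.")]
    print(audio_keys)  # expect num_mel_bins, embedding_length, ..., projector.stack_factor
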
@@ -6195,13 +6208,15 @@ def split_str_to_n_bytes(split_str: str) -> int:
 
 
 def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
+    # TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
+    # maybe we should fallback to text model's arch in that case, since not many models have both
     text_config = hparams.get("text_config", {})
     vision_config = hparams.get("vision_config", {})
     arch = hparams["architectures"][0]
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
-    elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
+    elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
         arch = vision_config["architectures"][0]
     return arch
 
@@ -6264,7 +6279,7 @@ def main() -> None:
 
     with torch.inference_mode():
         output_type = ftype_map[args.outtype]
-        model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
+        model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
         hparams = ModelBase.load_hparams(dir_model)
         model_architecture = get_model_architecture(hparams, model_type)
         logger.info(f"Model architecture: {model_architecture}")
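
Note: with --mmproj, main() now selects ModelType.MMPROJ, and get_model_architecture() plus the registry resolve the converter class. A rough sketch of that path inside convert_hf_to_gguf.py (the local path is illustrative, and the lookup is spelled directly through the registry dict here):

    hparams = ModelBase.load_hparams(Path("models/ultravox"))  # illustrative model directory
    model_type = ModelType.MMPROJ                              # what args.mmproj selects
    arch = get_model_architecture(hparams, model_type)         # e.g. "UltravoxModel"
    model_class = ModelBase._model_classes[model_type][arch]   # -> UltravoxAudioModel
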

gguf-py/gguf/constants.py

Lines changed: 22 additions & 7 deletions
@@ -219,11 +219,13 @@ class Adapter:
         TYPE = "adapter.type"
         LORA_ALPHA = "adapter.lora.alpha"
 
-    class ClipVision:
+    class Clip:
         PROJECTOR_TYPE = "clip.projector_type"
         HAS_VISION_ENCODER = "clip.has_vision_encoder"
         HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
         HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
+
+    class ClipVision:
         IMAGE_SIZE = "clip.vision.image_size"
         PATCH_SIZE = "clip.vision.patch_size"
         EMBEDDING_LENGTH = "clip.vision.embedding_length"
@@ -243,6 +245,19 @@ class Attention:
 
         class Projector:
             SCALE_FACTOR = "clip.vision.projector.scale_factor"
+
+    class ClipAudio:
+        NUM_MEL_BINS = "clip.audio.num_mel_bins"
+        EMBEDDING_LENGTH = "clip.audio.embedding_length"
+        FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
+        PROJECTION_DIM = "clip.audio.projection_dim"
+        BLOCK_COUNT = "clip.audio.block_count"
+
+        class Attention:
+            HEAD_COUNT = "clip.audio.attention.head_count"
+            LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon"
+
+        class Projector:
             STACK_FACTOR = "clip.audio.projector.stack_factor"
 
 #
251266

252267

253268
class GGUFType:
254-
MODEL = "model"
255-
ADAPTER = "adapter"
256-
CLIP_VISION = "clip-vision"
269+
MODEL = "model"
270+
ADAPTER = "adapter"
271+
MMPROJ = "mmproj" # dummy, unused for now
257272

258273

259274
class MODEL_ARCH(IntEnum):
260-
CLIP_VISION = auto() # dummy arch for clip.cpp
275+
MMPROJ = auto() # dummy arch for clip.cpp
261276
LLAMA = auto()
262277
LLAMA4 = auto()
263278
DECI = auto()
@@ -536,7 +551,7 @@ class MODEL_TENSOR(IntEnum):
536551

537552

538553
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
539-
MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp
554+
MODEL_ARCH.MMPROJ: "clip", # dummy arch for clip.cpp
540555
MODEL_ARCH.LLAMA: "llama",
541556
MODEL_ARCH.LLAMA4: "llama4",
542557
MODEL_ARCH.DECI: "deci",
@@ -815,7 +830,7 @@ class MODEL_TENSOR(IntEnum):
815830
}
816831

817832
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
818-
MODEL_ARCH.CLIP_VISION: [
833+
MODEL_ARCH.MMPROJ: [
819834
MODEL_TENSOR.V_MMPROJ,
820835
MODEL_TENSOR.V_MMPROJ_FC,
821836
MODEL_TENSOR.V_MMPROJ_MLP,
