Skip to content

Commit 1317028

Browse files
[Model] Gemma3: Fix GGUF loading and quantization (#26189)
Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Luciano Martins <lucianommartins@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
1 parent 5e49c3e commit 1317028

File tree

2 files changed: +14 additions, -0 deletions

vllm/model_executor/model_loader/gguf_loader.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ def _get_gguf_weights_map(self, model_config: ModelConfig):
7272
# hack: ggufs have a different name than transformers
7373
if model_type == "cohere":
7474
model_type = "command-r"
75+
if model_type == "gemma3_text":
76+
# Gemma3 models use "gemma3_text" in HuggingFace but
77+
# "gemma3" in GGUF architecture naming
78+
model_type = "gemma3"
7579
if model_type in ("deepseek_v3", "deepseek_v2"):
7680
model_type = "deepseek2"
7781
# GGUF layer map assumes that we will have a merged expert weights

vllm/model_executor/models/gemma3.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
372372
self.embed_tokens = VocabParallelEmbedding(
373373
config.vocab_size,
374374
config.hidden_size,
375+
quant_config=quant_config,
375376
prefix=f"{prefix}.embed_tokens",
376377
)
377378
self.start_layer, self.end_layer, self.layers = make_layers(
@@ -442,6 +443,15 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
442443
params_dict = dict(self.named_parameters())
443444
loaded_params: set[str] = set()
444445
for name, loaded_weight in weights:
446+
# Revert +1 during llama.cpp conversion
447+
# see: https://github.com/ggml-org/llama.cpp/blob/be7c3034108473beda214fd1d7c98fd6a7a3bdf5/convert_hf_to_gguf.py#L3397-L3400
448+
if (
449+
self.quant_config
450+
and self.quant_config.get_name() == "gguf"
451+
and name.endswith("norm.weight")
452+
):
453+
loaded_weight -= 1
454+
445455
if self.quant_config is not None and (
446456
scale_name := self.quant_config.get_cache_scale(name)
447457
):

Commit comments (0)