mirror of https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 13:55:38 +08:00
[Model] Gemma3: Fix GGUF loading and quantization (#26189)
Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
parent 5e49c3e777
commit 1317028aa8
@@ -72,6 +72,10 @@ class GGUFModelLoader(BaseModelLoader):
         # hack: ggufs have a different name than transformers
         if model_type == "cohere":
             model_type = "command-r"
+        if model_type == "gemma3_text":
+            # Gemma3 models use "gemma3_text" in HuggingFace but
+            # "gemma3" in GGUF architecture naming
+            model_type = "gemma3"
         if model_type in ("deepseek_v3", "deepseek_v2"):
             model_type = "deepseek2"
         # GGUF layer map assumes that we will have a merged expert weights
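For context on what the remapped name feeds into: the gguf-py package keys its tensor-name maps by GGUF architecture names, not HuggingFace `model_type` strings. A minimal sketch of that lookup, assuming gguf-py's `MODEL_ARCH_NAMES` and `get_tensor_name_map` (the layer count below is illustrative, not from the commit):

```python
import gguf

model_type = "gemma3_text"  # HuggingFace model_type for text-only Gemma3
if model_type == "gemma3_text":
    model_type = "gemma3"  # GGUF architecture name, per the fix above

# Find the gguf MODEL_ARCH enum whose registered name matches.
arch = next(
    (k for k, v in gguf.MODEL_ARCH_NAMES.items() if v == model_type),
    None,
)
if arch is None:
    raise RuntimeError(f"Unknown gguf model_type: {model_type}")

# Build the HF-name -> GGUF-name tensor map; vLLM's loader inverts this
# to translate GGUF tensor names back to HF parameter names.
name_map = gguf.get_tensor_name_map(arch, 34)  # 34 layers: illustrative
```

Without the `gemma3_text` -> `gemma3` remap, the lookup above finds no matching architecture and loading fails before any tensors are read.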
@@ -372,6 +372,7 @@ class Gemma3Model(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
             prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
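Passing `quant_config` through matters because GGUF checkpoints store the token-embedding tensor quantized; without it, `VocabParallelEmbedding` expects an unquantized weight and cannot load the GGUF tensor. A hedged end-to-end usage sketch of what this fix enables (the file name and tokenizer tag are placeholders, not from the commit):

```python
from vllm import LLM

# Hypothetical local GGUF file, paired with the matching HF tokenizer
# as vLLM's GGUF support expects.
llm = LLM(
    model="./gemma-3-4b-it-q4_0.gguf",
    tokenizer="google/gemma-3-4b-it",
)
outputs = llm.generate("Why is the sky blue?")
print(outputs[0].outputs[0].text)
```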
@@ -442,6 +443,15 @@ class Gemma3Model(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
+            # Revert +1 during llama.cpp conversion
+            # see: https://github.com/ggml-org/llama.cpp/blob/be7c3034108473beda214fd1d7c98fd6a7a3bdf5/convert_hf_to_gguf.py#L3397-L3400
+            if (
+                self.quant_config
+                and self.quant_config.get_name() == "gguf"
+                and name.endswith("norm.weight")
+            ):
+                loaded_weight -= 1
+
             if self.quant_config is not None and (
                 scale_name := self.quant_config.get_cache_scale(name)
             ):
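Why the `-1`: Gemma's RMSNorm stores a zero-centered weight and scales the normalized input by `(1 + weight)`, while llama.cpp's converter bakes the `+1` into the GGUF tensor so that a conventional RMSNorm kernel (which scales by the raw weight) produces the same output. vLLM applies the `(1 + weight)` form itself, so the offset must be subtracted back at load time. A minimal PyTorch sketch of the equivalence, assuming plain RMSNorm rather than vLLM's kernels:

```python
import torch

def rms_norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Normalize by the root-mean-square over the last dimension.
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)

x = torch.randn(4, 8)
w_hf = torch.randn(8) * 0.02  # zero-centered norm weight, HF convention

# HF/vLLM Gemma convention: scale the normalized input by (1 + weight).
y_hf = rms_norm(x) * (1.0 + w_hf)

# GGUF convention: the converter stored (weight + 1), and the kernel
# scales by the stored weight directly.
w_gguf = w_hf + 1.0
y_gguf = rms_norm(x) * w_gguf

assert torch.allclose(y_hf, y_gguf)  # same outputs either way

# So a GGUF norm weight needs the baked-in +1 removed on load:
assert torch.allclose(w_gguf - 1.0, w_hf)
```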