mirror of https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 13:55:38 +08:00
[Model] Gemma3: Fix GGUF loading and quantization (#26189)
Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
parent 5e49c3e777
commit 1317028aa8
@@ -72,6 +72,10 @@ class GGUFModelLoader(BaseModelLoader):
         # hack: ggufs have a different name than transformers
         if model_type == "cohere":
             model_type = "command-r"
+        if model_type == "gemma3_text":
+            # Gemma3 models use "gemma3_text" in HuggingFace but
+            # "gemma3" in GGUF architecture naming
+            model_type = "gemma3"
         if model_type in ("deepseek_v3", "deepseek_v2"):
             model_type = "deepseek2"
         # GGUF layer map assumes that we will have a merged expert weights
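For context on what the remapped name feeds into: the gguf-py package keys its tensor-name maps by GGUF architecture names, not HuggingFace `model_type` strings. A minimal sketch of that lookup, assuming gguf-py's `MODEL_ARCH_NAMES` and `get_tensor_name_map` (the layer count below is illustrative, not from the commit):

```python
import gguf

model_type = "gemma3_text"  # HuggingFace model_type for text-only Gemma3
if model_type == "gemma3_text":
    model_type = "gemma3"  # GGUF architecture name, per the fix above

# Find the gguf MODEL_ARCH enum whose registered name matches.
arch = next(
    (k for k, v in gguf.MODEL_ARCH_NAMES.items() if v == model_type),
    None,
)
if arch is None:
    raise RuntimeError(f"Unknown gguf model_type: {model_type}")

# Build the HF-name -> GGUF-name tensor map; vLLM's loader inverts this
# to translate GGUF tensor names back to HF parameter names.
name_map = gguf.get_tensor_name_map(arch, 34)  # 34 layers: illustrative
```

Without the `gemma3_text` -> `gemma3` remap, the lookup above finds no matching architecture and loading fails before any tensors are read.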
@@ -372,6 +372,7 @@ class Gemma3Model(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
             prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
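Passing `quant_config` through matters because GGUF checkpoints store the token-embedding tensor quantized; without it, `VocabParallelEmbedding` expects an unquantized weight and cannot load the GGUF tensor. A hedged end-to-end usage sketch of what this fix enables (the file name and tokenizer tag are placeholders, not from the commit):

```python
from vllm import LLM

# Hypothetical local GGUF file, paired with the matching HF tokenizer
# as vLLM's GGUF support expects.
llm = LLM(
    model="./gemma-3-4b-it-q4_0.gguf",
    tokenizer="google/gemma-3-4b-it",
)
outputs = llm.generate("Why is the sky blue?")
print(outputs[0].outputs[0].text)
```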
@@ -442,6 +443,15 @@ class Gemma3Model(nn.Module):
         params_dict = dict(self.named_parameters())
         loaded_params: set[str] = set()
         for name, loaded_weight in weights:
+            # Revert +1 during llama.cpp conversion
+            # see: https://github.com/ggml-org/llama.cpp/blob/be7c3034108473beda214fd1d7c98fd6a7a3bdf5/convert_hf_to_gguf.py#L3397-L3400
+            if (
+                self.quant_config
+                and self.quant_config.get_name() == "gguf"
+                and name.endswith("norm.weight")
+            ):
+                loaded_weight -= 1
+
             if self.quant_config is not None and (
                 scale_name := self.quant_config.get_cache_scale(name)
             ):
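Why the `-1`: Gemma's RMSNorm stores a zero-centered weight and scales the normalized input by `(1 + weight)`, while llama.cpp's converter bakes the `+1` into the GGUF tensor so that a conventional RMSNorm kernel (which scales by the raw weight) produces the same output. vLLM applies the `(1 + weight)` form itself, so the offset must be subtracted back at load time. A minimal PyTorch sketch of the equivalence, assuming plain RMSNorm rather than vLLM's kernels:

```python
import torch

def rms_norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Normalize by the root-mean-square over the last dimension.
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)

x = torch.randn(4, 8)
w_hf = torch.randn(8) * 0.02  # zero-centered norm weight, HF convention

# HF/vLLM Gemma convention: scale the normalized input by (1 + weight).
y_hf = rms_norm(x) * (1.0 + w_hf)

# GGUF convention: the converter stored (weight + 1), and the kernel
# scales by the stored weight directly.
w_gguf = w_hf + 1.0
y_gguf = rms_norm(x) * w_gguf

assert torch.allclose(y_hf, y_gguf)  # same outputs either way

# So a GGUF norm weight needs the baked-in +1 removed on load:
assert torch.allclose(w_gguf - 1.0, w_hf)
```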