mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-16 09:37:25 +08:00
[Model][Quantization] Restore MoE + GGUF models support (incl. Qwen3 MoE) by allowing Sideload Parameters (#30116)
Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
parent
c2e1987a6e
commit
58d5b3f514
@ -82,6 +82,7 @@ class GGUFConfig(QuantizationConfig):
|
||||
return UnquantizedEmbeddingMethod()
|
||||
return GGUFEmbeddingMethod(self)
|
||||
elif isinstance(layer, FusedMoE):
|
||||
# TODO: Select UnquantizedFusedMoEMethod on unquantized layers.
|
||||
return GGUFMoEMethod(self, layer.moe_config)
|
||||
return None
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ import os
|
||||
from collections.abc import Generator
|
||||
|
||||
import gguf
|
||||
import regex as re
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from huggingface_hub import hf_hub_download
|
||||
@ -94,6 +95,7 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
hasattr(config, "vision_config") and config.vision_config is not None
|
||||
)
|
||||
gguf_to_hf_name_map = {}
|
||||
sideload_params: list[re.Pattern] = []
|
||||
# hack: ggufs have a different name than transformers
|
||||
if model_type == "cohere":
|
||||
model_type = "command-r"
|
||||
@ -118,6 +120,12 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
|
||||
f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
|
||||
)
|
||||
sideload_params.append(
|
||||
re.compile(
|
||||
f"model\\.layers\\.{idx}"
|
||||
r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
|
||||
)
|
||||
)
|
||||
if model_type in ("qwen2_moe", "qwen3_moe"):
|
||||
model_type = model_type.replace("_", "")
|
||||
# GGUF layer map assumes that we will have merged expert weights
|
||||
@ -132,6 +140,12 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
|
||||
f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
|
||||
)
|
||||
sideload_params.append(
|
||||
re.compile(
|
||||
f"model\\.layers\\.{idx}"
|
||||
r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
|
||||
)
|
||||
)
|
||||
|
||||
arch = None
|
||||
for key, value in gguf.MODEL_ARCH_NAMES.items():
|
||||
@ -241,7 +255,15 @@ class GGUFModelLoader(BaseModelLoader):
|
||||
# Parameter not in manual overrides either
|
||||
unmapped_params.append(hf_name)
|
||||
|
||||
# All parameters must be mapped: both vision/projector and backbone
|
||||
# All parameters (except those initialized by other means) must be mapped:
|
||||
# both vision/projector and backbone
|
||||
if unmapped_params:
|
||||
unmapped_params = list(
|
||||
filter(
|
||||
lambda x: not any(re.fullmatch(p, x) for p in sideload_params),
|
||||
unmapped_params,
|
||||
)
|
||||
)
|
||||
if unmapped_params:
|
||||
raise RuntimeError(
|
||||
f"Failed to map GGUF parameters "
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user