mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-18 03:25:01 +08:00
[Model][Quantization] Restore MoE + GGUF models support (incl. Qwen3 MoE) by allowing Sideload Parameters (#30116)
Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
parent
c2e1987a6e
commit
58d5b3f514
@ -82,6 +82,7 @@ class GGUFConfig(QuantizationConfig):
|
|||||||
return UnquantizedEmbeddingMethod()
|
return UnquantizedEmbeddingMethod()
|
||||||
return GGUFEmbeddingMethod(self)
|
return GGUFEmbeddingMethod(self)
|
||||||
elif isinstance(layer, FusedMoE):
|
elif isinstance(layer, FusedMoE):
|
||||||
|
# TODO: Select UnquantizedFusedMoEMethod on unquantized layers.
|
||||||
return GGUFMoEMethod(self, layer.moe_config)
|
return GGUFMoEMethod(self, layer.moe_config)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,7 @@ import os
|
|||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
|
|
||||||
import gguf
|
import gguf
|
||||||
|
import regex as re
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from huggingface_hub import hf_hub_download
|
from huggingface_hub import hf_hub_download
|
||||||
@ -94,6 +95,7 @@ class GGUFModelLoader(BaseModelLoader):
|
|||||||
hasattr(config, "vision_config") and config.vision_config is not None
|
hasattr(config, "vision_config") and config.vision_config is not None
|
||||||
)
|
)
|
||||||
gguf_to_hf_name_map = {}
|
gguf_to_hf_name_map = {}
|
||||||
|
sideload_params: list[re.Pattern] = []
|
||||||
# hack: GGUF uses different model type names than transformers
|
# hack: GGUF uses different model type names than transformers
|
||||||
if model_type == "cohere":
|
if model_type == "cohere":
|
||||||
model_type = "command-r"
|
model_type = "command-r"
|
||||||
@ -118,6 +120,12 @@ class GGUFModelLoader(BaseModelLoader):
|
|||||||
gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
|
gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
|
||||||
f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
|
f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
|
||||||
)
|
)
|
||||||
|
sideload_params.append(
|
||||||
|
re.compile(
|
||||||
|
f"model\\.layers\\.{idx}"
|
||||||
|
r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
|
||||||
|
)
|
||||||
|
)
|
||||||
if model_type in ("qwen2_moe", "qwen3_moe"):
|
if model_type in ("qwen2_moe", "qwen3_moe"):
|
||||||
model_type = model_type.replace("_", "")
|
model_type = model_type.replace("_", "")
|
||||||
# GGUF layer map assumes that we will have merged expert weights
|
# GGUF layer map assumes that we will have merged expert weights
|
||||||
@ -132,6 +140,12 @@ class GGUFModelLoader(BaseModelLoader):
|
|||||||
gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
|
gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
|
||||||
f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
|
f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
|
||||||
)
|
)
|
||||||
|
sideload_params.append(
|
||||||
|
re.compile(
|
||||||
|
f"model\\.layers\\.{idx}"
|
||||||
|
r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
arch = None
|
arch = None
|
||||||
for key, value in gguf.MODEL_ARCH_NAMES.items():
|
for key, value in gguf.MODEL_ARCH_NAMES.items():
|
||||||
@ -241,7 +255,15 @@ class GGUFModelLoader(BaseModelLoader):
|
|||||||
# Parameter not in manual overrides either
|
# Parameter not in manual overrides either
|
||||||
unmapped_params.append(hf_name)
|
unmapped_params.append(hf_name)
|
||||||
|
|
||||||
# All parameters must be mapped: both vision/projector and backbone
|
# All parameters (except those initialized by other means) must be mapped:
|
||||||
|
# both vision/projector and backbone
|
||||||
|
if unmapped_params:
|
||||||
|
unmapped_params = list(
|
||||||
|
filter(
|
||||||
|
lambda x: not any(re.fullmatch(p, x) for p in sideload_params),
|
||||||
|
unmapped_params,
|
||||||
|
)
|
||||||
|
)
|
||||||
if unmapped_params:
|
if unmapped_params:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Failed to map GGUF parameters "
|
f"Failed to map GGUF parameters "
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user