[Model][Quantization] Restore MoE + GGUF models support (incl. Qwen3 MoE) by allowing Sideload Parameters (#30116)

Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-12-18 03:25:01 +08:00 · 2025-12-09 14:30:05 +09:00 · 2025-12-09 14:30:05 +09:00 · 58d5b3f514
commit 58d5b3f514
parent c2e1987a6e
2 changed files with 24 additions and 1 deletions
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@ -82,6 +82,7 @@ class GGUFConfig(QuantizationConfig):
                return UnquantizedEmbeddingMethod()
            return GGUFEmbeddingMethod(self)
        elif isinstance(layer, FusedMoE):
            # TODO: Select UnquantizedFusedMoEMethod on unquantized layers.
            return GGUFMoEMethod(self, layer.moe_config)
        return None
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ b/vllm/model_executor/model_loader/gguf_loader.py
@ -4,6 +4,7 @@ import os
 from collections.abc import Generator
 import gguf
 import regex as re
 import torch
 import torch.nn as nn
 from huggingface_hub import hf_hub_download
@ -94,6 +95,7 @@ class GGUFModelLoader(BaseModelLoader):
            hasattr(config, "vision_config") and config.vision_config is not None
        )
        gguf_to_hf_name_map = {}
        sideload_params: list[re.Pattern] = []
        # hack: ggufs have a different name than transformers
        if model_type == "cohere":
            model_type = "command-r"
@ -118,6 +120,12 @@ class GGUFModelLoader(BaseModelLoader):
                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                    f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
                )
                sideload_params.append(
                    re.compile(
                        f"model\\.layers\\.{idx}"
                        r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
                    )
                )
        if model_type in ("qwen2_moe", "qwen3_moe"):
            model_type = model_type.replace("_", "")
            # GGUF layer map assumes that we will have merged expert weights
@ -132,6 +140,12 @@ class GGUFModelLoader(BaseModelLoader):
                gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                    f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
                )
                sideload_params.append(
                    re.compile(
                        f"model\\.layers\\.{idx}"
                        r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
                    )
                )
        arch = None
        for key, value in gguf.MODEL_ARCH_NAMES.items():
@ -241,7 +255,15 @@ class GGUFModelLoader(BaseModelLoader):
                # Parameter not in manual overrides either
                unmapped_params.append(hf_name)
-        # All parameters must be mapped: both vision/projector and backbone
+        # All parameters (except those initialized by other means) must be mapped:
        # both vision/projector and backbone
        if unmapped_params:
            unmapped_params = list(
                filter(
                    lambda x: not any(re.fullmatch(p, x) for p in sideload_params),
                    unmapped_params,
                )
            )
        if unmapped_params:
            raise RuntimeError(
                f"Failed to map GGUF parameters "