[Model][Quantization] Restore MoE + GGUF models support (incl. Qwen3 MoE) by allowing Sideload Parameters (#30116)

Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Tsukasa OI authored 2025-12-09 14:30:05 +09:00, committed by GitHub
parent c2e1987a6e
commit 58d5b3f514
2 changed files with 24 additions and 1 deletion
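
As background for the diff below, here is a minimal, self-contained sketch of the "sideload parameters" idea (not the vLLM code itself; num_layers and num_experts are illustrative values): GGUF checkpoints store MoE expert weights as one merged tensor per layer (e.g. blk.0.ffn_up_exps.weight), so only expert 0 receives a direct GGUF-to-HF name mapping, and the remaining per-expert HF parameters are exempted from the "every parameter must be mapped" check by a list of regex patterns, since the loader fills them from the merged tensor instead.

import regex as re

num_layers, num_experts = 2, 4  # illustrative sizes, not from the commit
gguf_to_hf_name_map: dict[str, str] = {}
sideload_params: list[re.Pattern] = []

for idx in range(num_layers):
    # Only expert 0 is mapped directly; the merged GGUF tensor covers all experts.
    gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
        f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
    )
    sideload_params.append(
        re.compile(
            f"model\\.layers\\.{idx}"
            r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
        )
    )

# HF parameter names that found no GGUF counterpart (experts 1..N-1 here).
unmapped_params = [
    f"model.layers.{idx}.mlp.experts.{e}.up_proj.weight"
    for idx in range(num_layers)
    for e in range(1, num_experts)
]

# Same filter the loader now applies: sideloaded expert weights are not errors.
unmapped_params = [
    name
    for name in unmapped_params
    if not any(re.fullmatch(p, name) for p in sideload_params)
]
assert not unmapped_params

Using re.fullmatch keeps the exemption narrow: only the gate/up/down projection weights of the listed MoE layers are excused, so any other unmapped parameter still raises the RuntimeError shown in the last hunk.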


@@ -82,6 +82,7 @@ class GGUFConfig(QuantizationConfig):
                 return UnquantizedEmbeddingMethod()
             return GGUFEmbeddingMethod(self)
         elif isinstance(layer, FusedMoE):
+            # TODO: Select UnquantizedFusedMoEMethod on unquantized layers.
             return GGUFMoEMethod(self, layer.moe_config)
         return None


@@ -4,6 +4,7 @@ import os
 from collections.abc import Generator

 import gguf
+import regex as re
 import torch
 import torch.nn as nn
 from huggingface_hub import hf_hub_download
@@ -94,6 +95,7 @@ class GGUFModelLoader(BaseModelLoader):
             hasattr(config, "vision_config") and config.vision_config is not None
         )
         gguf_to_hf_name_map = {}
+        sideload_params: list[re.Pattern] = []
         # hack: ggufs have a different name than transformers
         if model_type == "cohere":
             model_type = "command-r"
@@ -118,6 +120,12 @@ class GGUFModelLoader(BaseModelLoader):
                 gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                     f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
                 )
+                sideload_params.append(
+                    re.compile(
+                        f"model\\.layers\\.{idx}"
+                        r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
+                    )
+                )
         if model_type in ("qwen2_moe", "qwen3_moe"):
             model_type = model_type.replace("_", "")
             # GGUF layer map assumes that we will have a merged expert weights
@@ -132,6 +140,12 @@ class GGUFModelLoader(BaseModelLoader):
                 gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = (
                     f"model.layers.{idx}.mlp.experts.0.up_proj.weight"
                 )
+                sideload_params.append(
+                    re.compile(
+                        f"model\\.layers\\.{idx}"
+                        r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight"
+                    )
+                )

         arch = None
         for key, value in gguf.MODEL_ARCH_NAMES.items():
@@ -241,7 +255,15 @@ class GGUFModelLoader(BaseModelLoader):
                 # Parameter not in manual overrides either
                 unmapped_params.append(hf_name)

-        # All parameters must be mapped: both vision/projector and backbone
+        # All parameters (except those initialized by other means) must be mapped:
+        # both vision/projector and backbone
+        if unmapped_params:
+            unmapped_params = list(
+                filter(
+                    lambda x: not any(re.fullmatch(p, x) for p in sideload_params),
+                    unmapped_params,
+                )
+            )
         if unmapped_params:
             raise RuntimeError(
                 f"Failed to map GGUF parameters "