From 73a484caa1ad320d6e695f098c25c479a71e6774 Mon Sep 17 00:00:00 2001
From: Tsukasa OI
Date: Wed, 10 Dec 2025 04:13:10 +0900
Subject: [PATCH] [Model][Quantization] Fix / Add GGUF support for Qwen2 MoE
 models (#30307)

Signed-off-by: Tsukasa OI
---
 vllm/model_executor/models/qwen2_moe.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 5a428740082f6..cbc618f1abd08 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -367,6 +367,8 @@ class Qwen2MoeModel(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
@@ -512,6 +514,12 @@ class Qwen2MoeModel(nn.Module):
                             continue
                         else:
                             name = remapped_kv_scale_name
+                    # GGUF: make sure that shared_expert_gate is a 2D tensor.
+                    if (
+                        "mlp.shared_expert_gate" in name
+                        and len(loaded_weight.shape) == 1
+                    ):
+                        loaded_weight = loaded_weight[None, :]
                     param = params_dict[name]
                     weight_loader = getattr(
                         param, "weight_loader", default_weight_loader
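
Note on the first hunk: before this patch, VocabParallelEmbedding was
constructed without a quant_config, so the embedding stayed unquantized
and GGUF-quantized embedding tensors could not be loaded into it; the
prefix argument supplies the fully qualified layer name that the
quantization config keys on when choosing a per-layer method. A minimal
sketch of that dispatch-by-prefix pattern, using made-up stand-in
classes rather than vLLM's real QuantizationConfig machinery:

    # Hypothetical stand-ins; the real vLLM classes differ, but the
    # "pick a quant method from the layer's prefix" idea is the same.
    class FakeGGUFConfig:
        def get_quant_method(self, prefix: str) -> str:
            # Real configs return a method object per layer; a string
            # stands in for it here.
            if prefix.endswith("embed_tokens"):
                return "gguf-embedding"
            return "gguf-linear"

    def build_embedding(prefix: str, quant_config) -> str:
        if quant_config is None:
            # Pre-patch behaviour: no config reaches the embedding, so
            # it is created unquantized and GGUF weights fail to load.
            return "unquantized"
        return quant_config.get_quant_method(prefix)

    print(build_embedding("model.embed_tokens", None))              # unquantized
    print(build_embedding("model.embed_tokens", FakeGGUFConfig()))  # gguf-embedding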
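
Note on the second hunk: Qwen2 MoE's shared_expert_gate is a Linear
layer with a single output feature, so the model allocates its weight
as a 2-D [1, hidden_size] parameter, while a GGUF checkpoint can yield
it as a 1-D vector of length hidden_size; the inserted [None, :] adds
the missing leading axis so the default weight loader's shape check
passes. A minimal plain-PyTorch sketch of the fix (the sizes and the
parameter name below are illustrative, not read from a real checkpoint):

    import torch

    hidden_size = 2048  # illustrative; real Qwen2 MoE configs differ
    name = "model.layers.0.mlp.shared_expert_gate.weight"  # hypothetical
    loaded_weight = torch.randn(hidden_size)  # 1-D, as GGUF may store it
    param = torch.empty(1, hidden_size)       # 2-D, as the model allocates it

    # The patched branch: promote the 1-D tensor to [1, hidden_size].
    if "mlp.shared_expert_gate" in name and len(loaded_weight.shape) == 1:
        loaded_weight = loaded_weight[None, :]

    # default_weight_loader effectively copies into the parameter,
    # which requires matching shapes.
    assert param.shape == loaded_weight.shape
    param.copy_(loaded_weight)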