From 9b94d6ec8f5f2e8ec2d3897ed05fb2b13cc012da Mon Sep 17 00:00:00 2001
From: Andy Chen <37168711+py-andy-c@users.noreply.github.com>
Date: Mon, 11 Aug 2025 19:02:14 -0700
Subject: [PATCH] Enable 4bit bnb prequant MOE (#21548)

Signed-off-by: Jee Jee Li
Co-authored-by: Jee Jee Li
---
 .../model_executor/model_loader/bitsandbytes_loader.py | 10 +++-------
 vllm/model_executor/models/qwen3_moe.py                |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index ea2fb2e3ac14..b8393956eed3 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -427,14 +427,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             elif isinstance(module, FusedMoE) and hasattr(
                     module.quant_method, "quant_config"):
                 # TODO: support FusedMoE with prequant and 8bit.
-                if self.pre_quant:
+                if self.pre_quant and self.load_8bit:
                     raise ValueError(
-                        "Prequant BitsAndBytes models with FusedMoE is not "
-                        "supported yet.")
-                if self.load_8bit:
-                    raise ValueError(
-                        "BitsAndBytes 8bit quantization with FusedMoE is not "
-                        "supported yet.")
+                        "Prequant BitsAndBytes 8bit models with FusedMoE "
+                        "is not supported yet.")
 
                 # Get the corresponding weight name using module name and
                 # expert_params_mapping.
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 9b49952f3724..085fc90b47b5 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -684,4 +684,4 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
         return loader.load_weights(weights)
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        return self.model.get_expert_mapping()
+        return self.model.get_expert_mapping()
\ No newline at end of file
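
Usage note: with this patch, the loader only rejects the pre-quantized
8-bit + FusedMoE combination, so a pre-quantized 4-bit bitsandbytes MoE
checkpoint should load through the regular vLLM entry points. A minimal
sketch follows; the HF repo id is a placeholder assumption (any repo
shipping bnb 4-bit weights for a FusedMoE model such as Qwen3-MoE), while
LLM, SamplingParams, and quantization="bitsandbytes" are the documented
vLLM API:

    # Sketch: load a pre-quantized 4-bit bnb MoE model with vLLM.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="some-org/Qwen3-30B-A3B-bnb-4bit",  # hypothetical bnb-4bit repo id
        quantization="bitsandbytes",  # route weights through the bnb loader
    )

    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)

An 8-bit pre-quantized MoE checkpoint still raises the ValueError kept
above, matching the remaining TODO in the loader.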