From 9b94d6ec8f5f2e8ec2d3897ed05fb2b13cc012da Mon Sep 17 00:00:00 2001
From: Andy Chen <37168711+py-andy-c@users.noreply.github.com>
Date: Mon, 11 Aug 2025 19:02:14 -0700
Subject: [PATCH] Enable 4bit bnb prequant MOE (#21548)

Signed-off-by: Jee Jee Li
Co-authored-by: Jee Jee Li
---
 .../model_executor/model_loader/bitsandbytes_loader.py | 10 +++-------
 vllm/model_executor/models/qwen3_moe.py                |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index ea2fb2e3ac14..b8393956eed3 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -427,14 +427,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             elif isinstance(module, FusedMoE) and hasattr(
                     module.quant_method, "quant_config"):
                 # TODO: support FusedMoE with prequant and 8bit.
-                if self.pre_quant:
+                if self.pre_quant and self.load_8bit:
                     raise ValueError(
-                        "Prequant BitsAndBytes models with FusedMoE is not "
-                        "supported yet.")
-                if self.load_8bit:
-                    raise ValueError(
-                        "BitsAndBytes 8bit quantization with FusedMoE is not "
-                        "supported yet.")
+                        "Prequant BitsAndBytes 8bit models with FusedMoE "
+                        "is not supported yet.")
 
                 # Get the corresponding weight name using module name and
                 # expert_params_mapping.
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 9b49952f3724..085fc90b47b5 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -684,4 +684,4 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
         return loader.load_weights(weights)
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        return self.model.get_expert_mapping()
+        return self.model.get_expert_mapping()
\ No newline at end of file
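
Usage note: with this patch, the loader only rejects the pre-quantized
8-bit + FusedMoE combination, so a pre-quantized 4-bit bitsandbytes MoE
checkpoint should load through the regular vLLM entry points. A minimal
sketch follows; the HF repo id is a placeholder assumption (any repo
shipping bnb 4-bit weights for a FusedMoE model such as Qwen3-MoE), while
LLM, SamplingParams, and quantization="bitsandbytes" are the documented
vLLM API:

    # Sketch: load a pre-quantized 4-bit bnb MoE model with vLLM.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="some-org/Qwen3-30B-A3B-bnb-4bit",  # hypothetical bnb-4bit repo id
        quantization="bitsandbytes",  # route weights through the bnb loader
    )

    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)

An 8-bit pre-quantized MoE checkpoint still raises the ValueError kept
above, matching the remaining TODO in the loader.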