diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index ea2fb2e3ac14..b8393956eed3 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -427,14 +427,10 @@ class BitsAndBytesModelLoader(BaseModelLoader):
             elif isinstance(module, FusedMoE) and hasattr(
                     module.quant_method, "quant_config"):
                 # TODO: support FusedMoE with prequant and 8bit.
-                if self.pre_quant:
+                if self.pre_quant and self.load_8bit:
                     raise ValueError(
-                        "Prequant BitsAndBytes models with FusedMoE is not "
-                        "supported yet.")
-                if self.load_8bit:
-                    raise ValueError(
-                        "BitsAndBytes 8bit quantization with FusedMoE is not "
-                        "supported yet.")
+                        "Prequant BitsAndBytes 8bit models with FusedMoE "
+                        "is not supported yet.")
 
                 # Get the corresponding weight name using module name and
                 # expert_params_mapping.
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index 9b49952f3724..085fc90b47b5 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -684,4 +684,4 @@ class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA,
         return loader.load_weights(weights)
 
     def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
-        return self.model.get_expert_mapping()
+        return self.model.get_expert_mapping()
\ No newline at end of file
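
With this change, the loader only rejects the combination of a prequantized BitsAndBytes checkpoint with 8-bit loading for FusedMoE layers, so prequantized 4-bit MoE checkpoints (e.g. for Qwen3-MoE) should now load. A minimal usage sketch follows; the checkpoint name is a placeholder, and the exact quantization/load-format flags may vary by vLLM version:

from vllm import LLM

# Hypothetical prequantized 4-bit bitsandbytes Qwen3-MoE checkpoint (placeholder name).
llm = LLM(
    model="some-org/Qwen3-MoE-bnb-4bit",
    quantization="bitsandbytes",   # route weights through the BitsAndBytes loader
    load_format="bitsandbytes",
)

outputs = llm.generate("Hello, world!")
print(outputs[0].outputs[0].text)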