From c0bd6a684aee2b47fe52b75ec97379e40ca5d36c Mon Sep 17 00:00:00 2001 From: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com> Date: Wed, 10 Sep 2025 23:22:31 +1000 Subject: [PATCH] Fix Auto_Round Quantization Loading on SM75 and Lower GPUs (#24217) Signed-off-by: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- vllm/model_executor/layers/quantization/auto_round.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index fb285413ba9ef..1ca92273430dd 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -327,6 +327,8 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: + return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) + else: from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) @@ -339,7 +341,6 @@ class AutoRoundConfig(QuantizationConfig): } return MoeWNA16Config.from_config(config).get_quant_method( layer, prefix) - return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) if isinstance(layer, (LinearBase, ParallelLMHead)): if use_marlin: