Fix Auto_Round Quantization Loading on SM75 and Lower GPUs (#24217)

Signed-off-by: RoadToNowhereX <37441177+RoadToNowhereX@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
RoadToNowhereX 2025-09-10 23:22:31 +10:00 committed by GitHub
parent 3144d90217
commit c0bd6a684a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -327,6 +327,8 @@ class AutoRoundConfig(QuantizationConfig):
if isinstance(layer, FusedMoE):
if use_marlin:
return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
else:
from vllm.model_executor.layers.quantization.moe_wna16 import (
MoeWNA16Config)
@ -339,7 +341,6 @@ class AutoRoundConfig(QuantizationConfig):
}
return MoeWNA16Config.from_config(config).get_quant_method(
layer, prefix)
return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe)
if isinstance(layer, (LinearBase, ParallelLMHead)):
if use_marlin: