From fcb9f879c1750774c03341c201ad8c1392d3ed23 Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Tue, 15 Jul 2025 19:53:42 -0700 Subject: [PATCH] =?UTF-8?q?[Bugfix]=20Correct=20per=5Fact=5Ftoken=20in=20C?= =?UTF-8?q?ompressedTensorsW8A8Fp8MoECutlassM=E2=80=A6=20(#20937)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ming Yang --- .../compressed_tensors/compressed_tensors_moe.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index baf4fec3cc686..c636e7e79bf57 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -929,10 +929,8 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias) - a1_scale = layer.w13_input_scale - a2_scale = layer.w2_input_scale - per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( - a2_scale.numel() != 1 if a2_scale is not None else False) + per_act_token = ( + self.input_quant.strategy == QuantizationStrategy.TOKEN) if self.fused_experts is None: # If no modular kernel is provided, use cutlass_moe_fp8 @@ -950,8 +948,8 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): expert_map=None if self.disable_expert_map else expert_map, w1_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, ) else: return self.fused_experts(