[Bug] Fix R1 Accuracy 0 Bug (#23294)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Author: Wentao Ye
Date: 2025-08-21 13:11:28 -04:00 (committed by GitHub)
Parent: f8ce022948
Commit: 48bfb0c9b7

@@ -1099,8 +1099,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 apply_router_weight_on_input=apply_router_weight_on_input,
             )
         else:
-            from vllm.model_executor.layers.fused_moe import fused_experts
-            return fused_experts(
+            common_kwargs = dict(
                 hidden_states=x,
                 w1=layer.w13_weight,
                 w2=layer.w2_weight,
@@ -1117,11 +1116,20 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                     if self.block_quant else layer.w2_weight_scale),
                 a1_scale=layer.w13_input_scale,
                 a2_scale=layer.w2_input_scale,
-                use_fp8_w8a8=True,
-                block_shape=self.quant_config.weight_block_size,
-                allow_deep_gemm=self.allow_deep_gemm,
-                allow_cutlass_block_scaled_grouped_gemm=(
-                    self.allow_cutlass_block_scaled_grouped_gemm))
+            )
+
+            if self.fused_experts is not None:
+                return self.fused_experts(**common_kwargs)
+            else:
+                from vllm.model_executor.layers.fused_moe import fused_experts
+                return fused_experts(
+                    **common_kwargs,
+                    use_fp8_w8a8=True,
+                    block_shape=self.quant_config.weight_block_size,
+                    allow_deep_gemm=self.allow_deep_gemm,
+                    allow_cutlass_block_scaled_grouped_gemm=(
+                        self.allow_cutlass_block_scaled_grouped_gemm),
+                )
 
 
 class Fp8KVCacheMethod(BaseKVCacheMethod):
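
In short, the fix stops the fp8 MoE path from unconditionally calling the default `fused_experts` kernel: the arguments shared by both paths are collected once into `common_kwargs`, a configured modular implementation (`self.fused_experts`) is preferred when present, and only the fallback receives the fp8-specific flags. Below is a minimal, self-contained sketch of that dispatch pattern; it is not vLLM's actual code, and names such as `MoEMethodSketch` and `default_fused_experts` are hypothetical stand-ins.

    from typing import Any, Callable, Optional


    def default_fused_experts(hidden_states: Any, w1: Any, w2: Any, *,
                              use_fp8_w8a8: bool = False,
                              block_shape: Optional[list[int]] = None) -> Any:
        """Stand-in for the default fused-experts kernel."""
        return hidden_states  # placeholder computation


    class MoEMethodSketch:
        def __init__(self,
                     fused_experts: Optional[Callable[..., Any]] = None,
                     weight_block_size: Optional[list[int]] = None) -> None:
            # A plugged-in modular kernel must take precedence; always
            # falling through to the default kernel was the bug pattern.
            self.fused_experts = fused_experts
            self.weight_block_size = weight_block_size

        def apply(self, x: Any, w13: Any, w2: Any) -> Any:
            # Arguments common to both paths are built exactly once.
            common_kwargs = dict(hidden_states=x, w1=w13, w2=w2)
            if self.fused_experts is not None:
                # The modular implementation was configured with its own
                # quantization details, so it takes only the shared args.
                return self.fused_experts(**common_kwargs)
            # Fallback: the default kernel additionally receives the
            # fp8/block-quant-specific flags.
            return default_fused_experts(**common_kwargs,
                                         use_fp8_w8a8=True,
                                         block_shape=self.weight_block_size)

For example, `MoEMethodSketch().apply(x, w13, w2)` takes the fallback path, while `MoEMethodSketch(fused_experts=my_kernel).apply(x, w13, w2)` routes to `my_kernel` instead of silently ignoring it, which is the behavior the commit restores.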