[BugFix][Performance] Restore flashinfer autotuning for all scenarios (#27904)

2025-12-10 19:14:57 +08:00 · 2025-11-04 02:56:21 -05:00 · 2025-11-04 02:56:21 -05:00 · 4022a9d279
commit 4022a9d279
parent 53f6e81dfd
4 changed files with 14 additions and 44 deletions
--- a/tests/quantization/test_blackwell_moe.py
+++ b/tests/quantization/test_blackwell_moe.py
@ -172,21 +172,9 @@ def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)
-def test_gptoss_dp2_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
+def test_gptoss_eager(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
    monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput")
    can_initialize(
        "openai/gpt-oss-20b",
        extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"],
        hf_overrides=HF_OVERRIDE_TEXT,
    )
 def test_gptoss_dp2_mxfp4bf16_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
    monkeypatch.setenv("VLLM_ALL2ALL_BACKEND", "deepep_high_throughput")
    can_initialize(
        "openai/gpt-oss-20b",
        extra_args=["--data-parallel-size", "2", "--enable-expert-parallel"],
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=["--enforce-eager"],
    )
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
@ -127,10 +127,17 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
            "routing_method_type": 1,
            "do_finalize": True,
            "output": output,
-            "tune_max_num_tokens": self.max_capture_size,
+            "tune_max_num_tokens": max(self.max_capture_size, 1),
        }
        from flashinfer import trtllm_fp4_block_scale_routed_moe
        from vllm.utils.flashinfer import autotune
        with autotune(False):
            # Re-enable autotune once
            # https://github.com/flashinfer-ai/flashinfer/issues/2023 is
            # resolved.
            trtllm_fp4_block_scale_routed_moe(**kwargs)
        return output
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@ -1047,7 +1047,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                None,
                1 if renormalize else 0,  # routing_method_type, renormalize
                True,  # do finalize
-                tune_max_num_tokens=self.max_capture_size,
+                tune_max_num_tokens=max(self.max_capture_size, 1),
            )[0]
            return trtllm_gen_output
        elif (
@ -1122,7 +1122,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
                tp_rank=self.moe.tp_rank,
                ep_size=self.moe.ep_size,
                ep_rank=self.moe.ep_rank,
-                tune_max_num_tokens=self.max_capture_size,
+                tune_max_num_tokens=max(self.max_capture_size, 1),
                **extra_kwargs,
            )
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@ -11,7 +11,6 @@ from typing import TYPE_CHECKING
 import torch
 import vllm.envs as envs
 from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup
 from vllm.platforms import current_platform
@ -25,26 +24,6 @@ if TYPE_CHECKING:
 logger = init_logger(__name__)
 def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool:
    """
    Tell whether FlashInfer autotuning is expected to run through cleanly
    for the given configuration. Known-problematic vllm + flashinfer
    autotune combinations are recorded here.

    Autotuning is reported as unsupported (False) only when all three of
    the following hold at once:
      * data-parallel or tensor-parallel size is greater than 1,
      * a FlashInfer MXFP4 MoE backend is selected (through one of the
        VLLM_USE_FLASHINFER_MOE_MXFP4_* env flags, or by the platform
        check below),
      * the cudagraph mode is CUDAGraphMode.NONE.
    In every other case the function returns True.
    """
    parallel_cfg = vllm_config.parallel_config
    multi_rank = (
        parallel_cfg.data_parallel_size > 1
        or parallel_cfg.tensor_parallel_size > 1
    )

    # Backend explicitly requested through one of the env flags.
    fi_mxfp4_via_env = (
        envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
        or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
    )
    # on >=sm100, default mxfp4 backend is flashinfer
    # NOTE(review): the comment says ">=" but the call used is
    # is_device_capability(100) -- confirm the intended comparison.
    uses_fi_mxfp4 = fi_mxfp4_via_env or (
        current_platform.is_cuda() and current_platform.is_device_capability(100)
    )

    cudagraphs_off = (
        vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
    )

    # Supported unless every one of the three conditions is met.
    return not multi_rank or not uses_fi_mxfp4 or not cudagraphs_off
 def kernel_warmup(worker: "Worker"):
    # Deep GEMM warmup
    do_deep_gemm_warmup = (
@ -58,11 +37,7 @@ def kernel_warmup(worker: "Worker"):
        deep_gemm_warmup(model, max_tokens)
    # FlashInfer autotune for Hopper (SM 9.0) and Blackwell (SM 10.0) GPUs
-    if (
+    if has_flashinfer() and current_platform.has_device_capability(90):
        has_flashinfer()
        and current_platform.has_device_capability(90)
        and flashinfer_autotune_supported(worker.vllm_config)
    ):
        flashinfer_autotune(worker.model_runner)
    # FlashInfer attention warmup