From e5e076cad7c1c922fa6d48049c45bead505f52a6 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 30 Oct 2025 11:24:31 -0400 Subject: [PATCH] [BugFix] Stopgap - Flashinfer Autotuner + GPT-OSS + DP/TP (#27762) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- vllm/model_executor/warmup/kernel_warmup.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py index 79d1927d32103..ffa3bc8f021ef 100644 --- a/vllm/model_executor/warmup/kernel_warmup.py +++ b/vllm/model_executor/warmup/kernel_warmup.py @@ -11,7 +11,7 @@ from typing import TYPE_CHECKING import torch import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.logger import init_logger from vllm.model_executor.warmup.deep_gemm_warmup import deep_gemm_warmup from vllm.platforms import current_platform @@ -30,13 +30,19 @@ def flashinfer_autotune_supported(vllm_config: VllmConfig) -> bool: Record known issues with vllm + flashinfer autotune here. Return True if and only if flashinfer autotune will run through without issues. """ - return not ( - vllm_config.parallel_config.data_parallel_size > 1 - and ( - envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 - or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - ) + is_tp_or_dp = (vllm_config.parallel_config.data_parallel_size > 1) or ( + vllm_config.parallel_config.tensor_parallel_size > 1 ) + is_fi_mxfp4_backend = ( + envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 + or envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS + ) or ( + current_platform.is_cuda() and current_platform.is_device_capability(100) + ) # on >=sm100, default mxfp4 backend is flashinfer + is_eager = vllm_config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE + + return not (is_tp_or_dp and is_fi_mxfp4_backend and is_eager) def kernel_warmup(worker: "Worker"):