From 6b6e98775f2452a105fa2b0b4b318ffe0cffb06d Mon Sep 17 00:00:00 2001
From: Jason Li
Date: Sun, 5 Oct 2025 16:24:37 -0400
Subject: [PATCH] [NVIDIA] flashinfer TRTLLM attention prefill token limit
 (#25998)

Signed-off-by: jasonlizhengjian
Signed-off-by: jasonlizhengjian
---
 vllm/utils/flashinfer.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index ab0cf2051f875..1d707d56daba5 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -283,11 +283,18 @@ def use_trtllm_attention(
 
     if force_use_trtllm is None:
         # Environment variable not set - use auto-detection
-        use_trtllm = (
-            num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto"
-        )
-        if use_trtllm:
-            logger.warning_once("Using TRTLLM attention (auto-detected).")
+        if is_prefill:
+            # Prefill auto-detection
+            use_trtllm = max_seq_len <= 131072 and kv_cache_dtype == "auto"
+            if use_trtllm:
+                logger.warning_once("Using TRTLLM prefill attention (auto-detected).")
+        else:
+            # Decode auto-detection
+            use_trtllm = (
+                num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto"
+            )
+            if use_trtllm:
+                logger.warning_once("Using TRTLLM decode attention (auto-detected).")
         return use_trtllm
 
     # Environment variable is set to 1 - respect it
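
For reference, below is a minimal standalone sketch of the auto-detection
behavior this patch introduces. It is not the actual vLLM function: the real
use_trtllm_attention takes additional parameters not visible in this hunk, the
function name use_trtllm_auto_detect is invented here for illustration, and
vLLM's logger.warning_once is replaced with a plain logging.warning so the
snippet runs with only the standard library. The idea the patch encodes: the
256-token batch limit only makes sense for decode, so prefill is gated solely
on max sequence length and KV-cache dtype.

    import logging

    logger = logging.getLogger(__name__)

    def use_trtllm_auto_detect(
        num_tokens: int,
        max_seq_len: int,
        kv_cache_dtype: str,
        is_prefill: bool,
    ) -> bool:
        """Mirror the patched auto-detection branch (env var unset)."""
        if is_prefill:
            # Prefill: no per-batch token limit; only seq-len and dtype checks.
            use_trtllm = max_seq_len <= 131072 and kv_cache_dtype == "auto"
            if use_trtllm:
                logger.warning("Using TRTLLM prefill attention (auto-detected).")
        else:
            # Decode: keep the original 256-token batch limit.
            use_trtllm = (
                num_tokens <= 256
                and max_seq_len <= 131072
                and kv_cache_dtype == "auto"
            )
            if use_trtllm:
                logger.warning("Using TRTLLM decode attention (auto-detected).")
        return use_trtllm

    # A long prefill batch now qualifies even with many tokens,
    # while decode still requires num_tokens <= 256.
    assert use_trtllm_auto_detect(4096, 8192, "auto", is_prefill=True)
    assert not use_trtllm_auto_detect(4096, 8192, "auto", is_prefill=False)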