From eea41804a4b4f84a80f63375ce2e77668d70bda5 Mon Sep 17 00:00:00 2001
From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com>
Date: Thu, 11 Dec 2025 03:18:51 +0800
Subject: [PATCH] [bug] Fix "Current vLLM config is not set." warnings when
 FlashInfer attention is used (#30241)

Signed-off-by: Po-Han Huang
---
 vllm/utils/flashinfer.py                 | 5 ++++-
 vllm/v1/attention/backends/flashinfer.py | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 7aaf690cbaa13..9a66049350cd8 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:
 
 def force_use_trtllm_attention() -> bool | None:
     """
+    This function should only be called during initialization stage when vllm config
+    is set.
     Return `None` if --attention-config.use_trtllm_attention is not set,
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
@@ -296,11 +298,12 @@ def use_trtllm_attention(
     kv_cache_dtype: str,
     q_dtype: torch.dtype,
     is_prefill: bool,
+    # None means auto-detection, True means force on, False means force off
+    force_use_trtllm: bool | None = None,
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
     """Return `True` if TRTLLM attention is used."""
-    force_use_trtllm = force_use_trtllm_attention()
 
     # CLI argument is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 8e9d764e4a123..4174b80ee312e 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
+        self.attention_config = vllm_config.attention_config
         self._workspace_buffer = None
         self._prefill_wrapper: (
             BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
@@ -779,6 +780,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.cache_dtype,
             self.q_data_type,
             is_prefill=True,
+            force_use_trtllm=self.attention_config.use_trtllm_attention,
             has_sinks=self.has_sinks,
             has_spec=uses_spec_reorder,
        )
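
For context, the pattern this patch adopts is to resolve the TRTLLM-attention override once, at metadata-builder construction time (when the vLLM config is guaranteed to be set), and to pass it into the per-batch use_trtllm_attention() call as an explicit force_use_trtllm argument instead of re-reading the global config on the hot path. The standalone sketch below illustrates that shape only; the names AttentionConfig, MetadataBuilder, and decide_backend are simplified stand-ins, not vLLM code.

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class AttentionConfig:
    # None means auto-detection, True forces TRTLLM on, False forces it off,
    # mirroring the semantics of the new force_use_trtllm parameter.
    use_trtllm_attention: bool | None = None


def decide_backend(force_use_trtllm: bool | None, heuristics_say_yes: bool) -> bool:
    """Stand-in for use_trtllm_attention(): an explicit override wins, else auto-detect."""
    if force_use_trtllm is not None:
        return force_use_trtllm
    return heuristics_say_yes


class MetadataBuilder:
    def __init__(self, attention_config: AttentionConfig) -> None:
        # Read the config once during initialization, when it is known to be set,
        # so the per-batch path never has to touch the global config again.
        self.attention_config = attention_config

    def build(self, heuristics_say_yes: bool) -> bool:
        return decide_backend(
            force_use_trtllm=self.attention_config.use_trtllm_attention,
            heuristics_say_yes=heuristics_say_yes,
        )


if __name__ == "__main__":
    builder = MetadataBuilder(AttentionConfig(use_trtllm_attention=None))
    print(builder.build(heuristics_say_yes=True))   # True: auto-detected
    builder = MetadataBuilder(AttentionConfig(use_trtllm_attention=False))
    print(builder.build(heuristics_say_yes=True))   # False: forced off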