From eea41804a4b4f84a80f63375ce2e77668d70bda5 Mon Sep 17 00:00:00 2001
From: "Po-Han Huang (NVIDIA)" <53919306+nvpohanh@users.noreply.github.com>
Date: Thu, 11 Dec 2025 03:18:51 +0800
Subject: [PATCH] [bug] Fix "Current vLLM config is not set." warnings when
 FlashInfer attention is used (#30241)

Signed-off-by: Po-Han Huang
---
 vllm/utils/flashinfer.py                 | 5 ++++-
 vllm/v1/attention/backends/flashinfer.py | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 7aaf690cbaa13..9a66049350cd8 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:
 
 def force_use_trtllm_attention() -> bool | None:
     """
+    This function should only be called during initialization stage when vllm config
+    is set.
     Return `None` if --attention-config.use_trtllm_attention is not set,
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
@@ -296,11 +298,12 @@ def use_trtllm_attention(
     kv_cache_dtype: str,
     q_dtype: torch.dtype,
     is_prefill: bool,
+    # None means auto-detection, True means force on, False means force off
+    force_use_trtllm: bool | None = None,
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
     """Return `True` if TRTLLM attention is used."""
-    force_use_trtllm = force_use_trtllm_attention()
 
     # CLI argument is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 8e9d764e4a123..4174b80ee312e 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
+        self.attention_config = vllm_config.attention_config
         self._workspace_buffer = None
         self._prefill_wrapper: (
             BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
@@ -779,6 +780,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.cache_dtype,
             self.q_data_type,
             is_prefill=True,
+            force_use_trtllm=self.attention_config.use_trtllm_attention,
             has_sinks=self.has_sinks,
             has_spec=uses_spec_reorder,
        )
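
For context, the pattern this patch adopts is to resolve the TRTLLM-attention override once, at metadata-builder construction time (when the vLLM config is guaranteed to be set), and to pass it into the per-batch use_trtllm_attention() call as an explicit force_use_trtllm argument instead of re-reading the global config on the hot path. The standalone sketch below illustrates that shape only; the names AttentionConfig, MetadataBuilder, and decide_backend are simplified stand-ins, not vLLM code.

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class AttentionConfig:
    # None means auto-detection, True forces TRTLLM on, False forces it off,
    # mirroring the semantics of the new force_use_trtllm parameter.
    use_trtllm_attention: bool | None = None


def decide_backend(force_use_trtllm: bool | None, heuristics_say_yes: bool) -> bool:
    """Stand-in for use_trtllm_attention(): an explicit override wins, else auto-detect."""
    if force_use_trtllm is not None:
        return force_use_trtllm
    return heuristics_say_yes


class MetadataBuilder:
    def __init__(self, attention_config: AttentionConfig) -> None:
        # Read the config once during initialization, when it is known to be set,
        # so the per-batch path never has to touch the global config again.
        self.attention_config = attention_config

    def build(self, heuristics_say_yes: bool) -> bool:
        return decide_backend(
            force_use_trtllm=self.attention_config.use_trtllm_attention,
            heuristics_say_yes=heuristics_say_yes,
        )


if __name__ == "__main__":
    builder = MetadataBuilder(AttentionConfig(use_trtllm_attention=None))
    print(builder.build(heuristics_say_yes=True))   # True: auto-detected
    builder = MetadataBuilder(AttentionConfig(use_trtllm_attention=False))
    print(builder.build(heuristics_say_yes=True))   # False: forced off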