[bug] Fix "Current vLLM config is not set." warnings when FlashInfer attention is used (#30241)

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Authored by Po-Han Huang (NVIDIA) on 2025-12-11 03:18:51 +08:00; committed by GitHub
parent 9f042ba26b
commit eea41804a4
2 changed files with 6 additions and 1 deletion


@@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:
 def force_use_trtllm_attention() -> bool | None:
     """
+    This function should only be called during initialization stage when vllm config
+    is set.
     Return `None` if --attention-config.use_trtllm_attention is not set,
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
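For context, the "Current vLLM config is not set." warning is emitted when code asks for the global vLLM config outside of an active config context, which is what happened while this helper was being called per forward pass. A minimal sketch of that pattern, assuming the helper resolves the flag via vllm.config.get_current_vllm_config(); the exact lookup is an assumption here, only the attention_config.use_trtllm_attention field is taken from this diff:

    # Sketch only: why calling the helper outside the init stage warns.
    from vllm.config import get_current_vllm_config

    def force_use_trtllm_attention_sketch() -> bool | None:
        # With no active `with set_current_vllm_config(...):` block, this call
        # logs the warning from the commit title instead of seeing the real config.
        vllm_config = get_current_vllm_config()
        return vllm_config.attention_config.use_trtllm_attention
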
@@ -296,11 +298,12 @@ def use_trtllm_attention(
     kv_cache_dtype: str,
     q_dtype: torch.dtype,
     is_prefill: bool,
+    # None means auto-detection, True means force on, False means force off
+    force_use_trtllm: bool | None = None,
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
     """Return `True` if TRTLLM attention is used."""
-    force_use_trtllm = force_use_trtllm_attention()
     # CLI argument is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
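
The inline comment pins down the semantics of the new keyword: None defers to auto-detection, while True/False are hard overrides. A standalone sketch of that precedence (the helper name is hypothetical and not part of the diff):

    def resolve_trtllm_choice(force_use_trtllm: bool | None, auto_detected: bool) -> bool:
        """Hypothetical helper mirroring the precedence use_trtllm_attention() applies."""
        if force_use_trtllm is not None:
            # Explicit override wins, matching the
            # "CLI argument is set to 0 - respect it" branch above.
            return force_use_trtllm
        return auto_detected

    assert resolve_trtllm_choice(None, auto_detected=True) is True    # auto-detect
    assert resolve_trtllm_choice(False, auto_detected=True) is False  # forced off
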


@@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
+        self.attention_config = vllm_config.attention_config
         self._workspace_buffer = None
         self._prefill_wrapper: (
             BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
@@ -779,6 +780,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.cache_dtype,
             self.q_data_type,
             is_prefill=True,
+            force_use_trtllm=self.attention_config.use_trtllm_attention,
             has_sinks=self.has_sinks,
             has_spec=uses_spec_reorder,
         )
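
Taken together, the builder change is a capture-then-forward pattern: the attention config is read once in __init__, while the vLLM config context is still set, and the cached value is passed on the hot path so no global lookup happens during scheduling. A condensed, self-contained sketch of that pattern (the stub class names are hypothetical; only attention_config.use_trtllm_attention comes from the diff):

    from dataclasses import dataclass

    @dataclass
    class AttentionConfigStub:
        # Stand-in for the real attention config object referenced above.
        use_trtllm_attention: bool | None = None

    class BuilderSketch:
        """Stand-in for FlashInferMetadataBuilder's init-time capture."""

        def __init__(self, attention_config: AttentionConfigStub) -> None:
            # Captured once, while the vLLM config is guaranteed to be set.
            self.attention_config = attention_config

        def build(self) -> bool:
            # Hot path: use the cached value instead of re-reading global
            # config, which is what previously triggered the warning.
            forced = self.attention_config.use_trtllm_attention
            return forced if forced is not None else self._auto_detect()

        def _auto_detect(self) -> bool:
            # Placeholder for the hardware/dtype heuristics in use_trtllm_attention().
            return False

    # Usage: force TRTLLM attention on at init; build() never touches global state.
    builder = BuilderSketch(AttentionConfigStub(use_trtllm_attention=True))
    assert builder.build() is True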