[bug] Fix "Current vLLM config is not set." warnings when FlashInfer attention is used (#30241)

Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Authored by Po-Han Huang (NVIDIA) on 2025-12-11 03:18:51 +08:00; committed by GitHub
parent 9f042ba26b
commit eea41804a4
2 changed files with 6 additions and 1 deletion


@@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:
 def force_use_trtllm_attention() -> bool | None:
     """
+    This function should only be called during initialization stage when vllm config
+    is set.
     Return `None` if --attention-config.use_trtllm_attention is not set,
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
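For context, the "Current vLLM config is not set." warning is emitted when code asks for the global vLLM config outside of an active config context, which is what happened while this helper was being called per forward pass. A minimal sketch of that pattern, assuming the helper resolves the flag via vllm.config.get_current_vllm_config(); the exact lookup is an assumption here, only the attention_config.use_trtllm_attention field is taken from this diff:

    # Sketch only: why calling the helper outside the init stage warns.
    from vllm.config import get_current_vllm_config

    def force_use_trtllm_attention_sketch() -> bool | None:
        # With no active `with set_current_vllm_config(...):` block, this call
        # logs the warning from the commit title instead of seeing the real config.
        vllm_config = get_current_vllm_config()
        return vllm_config.attention_config.use_trtllm_attention
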
@@ -296,11 +298,12 @@ def use_trtllm_attention(
     kv_cache_dtype: str,
     q_dtype: torch.dtype,
     is_prefill: bool,
+    # None means auto-detection, True means force on, False means force off
+    force_use_trtllm: bool | None = None,
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
     """Return `True` if TRTLLM attention is used."""
-    force_use_trtllm = force_use_trtllm_attention()
     # CLI argument is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
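
The inline comment pins down the semantics of the new keyword: None defers to auto-detection, while True/False are hard overrides. A standalone sketch of that precedence (the helper name is hypothetical and not part of the diff):

    def resolve_trtllm_choice(force_use_trtllm: bool | None, auto_detected: bool) -> bool:
        """Hypothetical helper mirroring the precedence use_trtllm_attention() applies."""
        if force_use_trtllm is not None:
            # Explicit override wins, matching the
            # "CLI argument is set to 0 - respect it" branch above.
            return force_use_trtllm
        return auto_detected

    assert resolve_trtllm_choice(None, auto_detected=True) is True    # auto-detect
    assert resolve_trtllm_choice(False, auto_detected=True) is False  # forced off
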


@@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
+        self.attention_config = vllm_config.attention_config
         self._workspace_buffer = None
         self._prefill_wrapper: (
             BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
@@ -779,6 +780,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.cache_dtype,
             self.q_data_type,
             is_prefill=True,
+            force_use_trtllm=self.attention_config.use_trtllm_attention,
             has_sinks=self.has_sinks,
             has_spec=uses_spec_reorder,
         )
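
Taken together, the builder change is a capture-then-forward pattern: the attention config is read once in __init__, while the vLLM config context is still set, and the cached value is passed on the hot path so no global lookup happens during scheduling. A condensed, self-contained sketch of that pattern (the stub class names are hypothetical; only attention_config.use_trtllm_attention comes from the diff):

    from dataclasses import dataclass

    @dataclass
    class AttentionConfigStub:
        # Stand-in for the real attention config object referenced above.
        use_trtllm_attention: bool | None = None

    class BuilderSketch:
        """Stand-in for FlashInferMetadataBuilder's init-time capture."""

        def __init__(self, attention_config: AttentionConfigStub) -> None:
            # Captured once, while the vLLM config is guaranteed to be set.
            self.attention_config = attention_config

        def build(self) -> bool:
            # Hot path: use the cached value instead of re-reading global
            # config, which is what previously triggered the warning.
            forced = self.attention_config.use_trtllm_attention
            return forced if forced is not None else self._auto_detect()

        def _auto_detect(self) -> bool:
            # Placeholder for the hardware/dtype heuristics in use_trtllm_attention().
            return False

    # Usage: force TRTLLM attention on at init; build() never touches global state.
    builder = BuilderSketch(AttentionConfigStub(use_trtllm_attention=True))
    assert builder.build() is True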