[bug] Fix "Current vLLM config is not set." warnings when FlashInfer attention is used (#30241)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
parent 9f042ba26b
commit eea41804a4
@@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:
 
 def force_use_trtllm_attention() -> bool | None:
     """
+    This function should only be called during initialization stage when vllm config
+    is set.
     Return `None` if --attention-config.use_trtllm_attention is not set,
     return `True` if TRTLLM attention is forced to be used,
     return `False` if TRTLLM attention is forced to be not used.
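Note: the new docstring pins down when this helper may be called. As a hedged illustration of why that matters (not vLLM code; every name below is a stand-in), the failure mode this commit removes is roughly a global config that is only "set" inside an initialization scope and warns when read outside of it:

# Hypothetical mini-reproduction, not vLLM code: a config that is only "set"
# inside an initialization scope and warns when read outside of it, which is
# the analogue of calling force_use_trtllm_attention() at runtime.
import warnings
from contextlib import contextmanager
from typing import Iterator

_current_config: dict | None = None


@contextmanager
def set_current_config(cfg: dict) -> Iterator[None]:
    global _current_config
    _current_config = cfg
    try:
        yield
    finally:
        _current_config = None


def read_force_flag() -> bool | None:
    if _current_config is None:
        # Analogue of the "Current vLLM config is not set." warning in the commit title.
        warnings.warn("Current config is not set.")
        return None
    return _current_config.get("use_trtllm_attention")


if __name__ == "__main__":
    with set_current_config({"use_trtllm_attention": True}):
        print(read_force_flag())  # True: read during the initialization scope
    print(read_force_flag())      # None, plus a warning: read after the scope ended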
@@ -296,11 +298,12 @@ def use_trtllm_attention(
     kv_cache_dtype: str,
     q_dtype: torch.dtype,
     is_prefill: bool,
+    # None means auto-detection, True means force on, False means force off
+    force_use_trtllm: bool | None = None,
     has_sinks: bool = False,
     has_spec: bool = False,
 ) -> bool:
     """Return `True` if TRTLLM attention is used."""
-    force_use_trtllm = force_use_trtllm_attention()
-
     # CLI argument is set to 0 - respect it
     if force_use_trtllm is not None and not force_use_trtllm:
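Note: a minimal sketch of how the new force_use_trtllm parameter is meant to be consumed, assuming only the early-return logic visible in this hunk. The real use_trtllm_attention() also weighs the dtypes, sinks, spec-decode, and prefill arguments from its signature; the stand-in below collapses all of that into one auto_detected_ok flag.

# Illustrative decision skeleton, not the real use_trtllm_attention(): it keeps
# only the force-flag handling shown in this hunk and collapses every other
# check into `auto_detected_ok`.
def decide_trtllm(force_use_trtllm: bool | None, auto_detected_ok: bool) -> bool:
    # CLI argument is set to 0 - respect it (mirrors the hunk above).
    if force_use_trtllm is not None and not force_use_trtllm:
        return False
    # Illustrative: a forced-on flag wins here; the real function may still
    # apply support checks before honoring it.
    if force_use_trtllm:
        return True
    # None: the flag was never set, so fall back to auto-detection.
    return auto_detected_ok


if __name__ == "__main__":
    print(decide_trtllm(None, auto_detected_ok=True))    # True  (auto-detected)
    print(decide_trtllm(False, auto_detected_ok=True))   # False (forced off)
    print(decide_trtllm(True, auto_detected_ok=False))   # True  (forced on)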
@@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.cache_config = vllm_config.cache_config
         self.model_config = vllm_config.model_config
+        self.attention_config = vllm_config.attention_config
         self._workspace_buffer = None
         self._prefill_wrapper: (
             BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
@@ -779,6 +780,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.cache_dtype,
             self.q_data_type,
             is_prefill=True,
+            force_use_trtllm=self.attention_config.use_trtllm_attention,
             has_sinks=self.has_sinks,
             has_spec=uses_spec_reorder,
         )
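Note: taken together, the two FlashInferMetadataBuilder hunks read the config once in __init__ and forward a plain value at build time. A condensed, hedged sketch of that shape; the *Like classes are stand-ins, not vLLM types.

# Condensed wiring sketch; AttentionConfigLike, VllmConfigLike and BuilderLike
# are stand-ins, not vLLM classes. The config is read once at construction
# time, and only a plain bool/None crosses the per-step call boundary, so no
# global-config lookup happens at runtime.
from dataclasses import dataclass


@dataclass
class AttentionConfigLike:
    use_trtllm_attention: bool | None = None


@dataclass
class VllmConfigLike:
    attention_config: AttentionConfigLike


class BuilderLike:
    def __init__(self, vllm_config: VllmConfigLike) -> None:
        # Mirrors the @@ -429 hunk: cache the sub-config while the config is set.
        self.attention_config = vllm_config.attention_config

    def build(self) -> bool | None:
        # Mirrors the @@ -779 hunk: forward the cached flag on every call.
        force_use_trtllm = self.attention_config.use_trtllm_attention
        return force_use_trtllm


if __name__ == "__main__":
    builder = BuilderLike(VllmConfigLike(AttentionConfigLike(use_trtllm_attention=True)))
    print(builder.build())  # True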