[Bugfix] Switch bailout logic for kv-cache-dtype with SM100 Flashinfer (#20934)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
commit 9ad0a4588b
parent 016b8d1b7f
@@ -1418,14 +1418,15 @@ class EngineArgs:
                 and not envs.is_set("VLLM_ATTENTION_BACKEND")
             ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
             supported = False
-            if current_platform.is_rocm():
+            if current_platform.is_rocm() or (
+                    current_platform.is_cuda()
+                    and current_platform.is_device_capability(100)):
                 supported = True
             elif fp8_attention and will_use_fa:
                 from vllm.attention.utils.fa_utils import (
                     flash_attn_supports_fp8)
                 supported = flash_attn_supports_fp8()
-            elif envs.VLLM_USE_TRTLLM_DECODE_ATTENTION:
-                supported = True
             if not supported:
                 _raise_or_fallback(feature_name="--kv-cache-dtype",
                                    recommend_to_remove=False)
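For context, the hunk above changes when vLLM accepts a non-auto --kv-cache-dtype: instead of keying on the VLLM_USE_TRTLLM_DECODE_ATTENTION environment variable, CUDA devices reporting compute capability 10.0 (SM100) are now accepted directly, alongside ROCm and the FlashAttention fp8 path. Below is a minimal sketch of that decision rewritten as a standalone helper; it is a paraphrase of the hunk, not the actual EngineArgs code path, and it assumes vLLM's internal helpers (vllm.envs, current_platform, flash_attn_supports_fp8) behave as they are used in the diff.

# Sketch only: mirrors the post-change bailout logic from the hunk above.
import vllm.envs as envs
from vllm.platforms import current_platform


def kv_cache_dtype_supported(kv_cache_dtype: str) -> bool:
    """Return True if the requested --kv-cache-dtype should be accepted."""
    if kv_cache_dtype == "auto":
        return True  # nothing to validate

    fp8_attention = kv_cache_dtype.startswith("fp8")
    # FlashAttention is the expected backend either when it is the CUDA
    # default (no explicit backend override) or when it is selected explicitly.
    will_use_fa = (
        current_platform.is_cuda()
        and not envs.is_set("VLLM_ATTENTION_BACKEND")
    ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"

    # ROCm, and CUDA devices with compute capability 10.0 (SM100), are
    # accepted unconditionally; this replaces the old check on
    # VLLM_USE_TRTLLM_DECODE_ATTENTION.
    if current_platform.is_rocm() or (
            current_platform.is_cuda()
            and current_platform.is_device_capability(100)):
        return True
    if fp8_attention and will_use_fa:
        from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
        return flash_attn_supports_fp8()
    return False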