mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-05-04 02:17:51 +08:00)
[CI Failure] Fix fp8 kv cache on <SM90 (#25396)
Signed-off-by: mgoin <mgoin64@gmail.com>
parent 1d7f95b85c
commit 239ef0c1ac
@@ -286,6 +286,9 @@ class CudaPlatformBase(Platform):
         TREE_ATTN_V1 = "vllm.v1.attention.backends.tree_attn.TreeAttentionBackend"  # noqa: E501
         XFORMERS_V1 = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"  # noqa: E501
 
+        use_fp8_kv_cache = (kv_cache_dtype is not None
+                            and kv_cache_dtype.startswith("fp8"))
+
         if selected_backend == _Backend.FLASHINFER:
             logger.info_once("Using FlashInfer backend on V1 engine.")
             if cls.has_device_capability(100):
@@ -334,10 +337,11 @@ class CudaPlatformBase(Platform):
 
         # FlashAttention is the default for SM 8.0+ GPUs
         if cls.has_device_capability(80):
-            if has_sink and not cls.is_device_capability(90):
+            if (has_sink or
+                    use_fp8_kv_cache) and not cls.is_device_capability(90):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return TRITON_ATTN_VLLM_V1
-            if is_default_backend_supported := is_attn_backend_supported(
+            elif is_default_backend_supported := is_attn_backend_supported(
                     FLASH_ATTN_V1, head_size, dtype,
                     allow_import_error=False):
                 logger.info_once("Using Flash Attention backend on "
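
For context, the change works as follows: FlashAttention cannot serve an fp8 KV cache on GPUs below SM90, so the selection logic now treats `use_fp8_kv_cache` the same way as `has_sink` and falls back to the Triton attention backend on those devices. The sketch below is a minimal, self-contained approximation of that decision, not the vllm code itself; `pick_attn_backend` and its `major` compute-capability parameter are illustrative stand-ins for `cls.has_device_capability` / `cls.is_device_capability`, and it omits the `is_attn_backend_supported` probing the real method performs.

# Minimal sketch of the backend choice changed by this commit.
# Assumption: `major` stands in for the GPU compute capability major
# version (8 = SM80/A100, 9 = SM90/H100); this is not the vllm API.

def pick_attn_backend(major: int, kv_cache_dtype: str | None,
                      has_sink: bool = False) -> str:
    # Same predicate the commit adds in the first hunk above.
    use_fp8_kv_cache = (kv_cache_dtype is not None
                        and kv_cache_dtype.startswith("fp8"))
    if major >= 8:
        # Before this fix only `has_sink` forced the Triton fallback, so an
        # fp8 KV cache on <SM90 reached FlashAttention and failed in CI.
        if (has_sink or use_fp8_kv_cache) and major != 9:
            return "TRITON_ATTN_VLLM_V1"
        return "FLASH_ATTN_V1"
    raise NotImplementedError("pre-SM80 paths omitted from this sketch")

assert pick_attn_backend(8, "fp8") == "TRITON_ATTN_VLLM_V1"  # the CI failure case
assert pick_attn_backend(9, "fp8_e4m3") == "FLASH_ATTN_V1"   # SM90 keeps FlashAttention
assert pick_attn_backend(8, None) == "FLASH_ATTN_V1"         # no fp8 cache: unchanged

Note that `is_device_capability(90)` in the real code matches SM90 exactly; the `major != 9` comparison in the sketch only approximates that check.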