mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 19:35:44 +08:00
[Bugfix] Disable cascade attention with FlashInfer (#26130)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
parent
13cdc02173
commit
f1fc2107a3
@@ -29,7 +29,6 @@ from vllm.utils.flashinfer import (can_use_trtllm_attention,
|
|||||||
flashinfer_disable_q_quantization,
|
flashinfer_disable_q_quantization,
|
||||||
supports_trtllm_attention,
|
supports_trtllm_attention,
|
||||||
use_trtllm_attention)
|
use_trtllm_attention)
|
||||||
from vllm.v1.attention.backends.flash_attn import use_cascade_attention
|
|
||||||
# yapf conflicts with isort for this block
|
# yapf conflicts with isort for this block
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
from vllm.v1.attention.backends.utils import (AttentionCGSupport,
|
from vllm.v1.attention.backends.utils import (AttentionCGSupport,
|
||||||
@@ -677,7 +676,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
|||||||
# TODO: The cascade wrapper currently does not support setting
|
# TODO: The cascade wrapper currently does not support setting
|
||||||
# kv cache dtype to something different from query dtype.
|
# kv cache dtype to something different from query dtype.
|
||||||
return False
|
return False
|
||||||
return use_cascade_attention(*args, **kwargs)
|
# TODO: Cascade attention doesn't work, disable it for now
|
||||||
|
# return use_cascade_attention(*args, **kwargs)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class FlashInferImpl(AttentionImpl):
|
class FlashInferImpl(AttentionImpl):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user