mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-05 08:57:57 +08:00
[Bugfix] Fix fp8 tests for triton_unified_attention for Triton 3.3 (#18013)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
This commit is contained in:
parent
26d0419309
commit
e6b8e65d2d
@@ -99,6 +99,9 @@ def test_triton_unified_attn(
|
|||||||
) -> None:
|
) -> None:
|
||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
|
|
||||||
|
if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32:
|
||||||
|
pytest.skip("block size must be at least 32 for fp8")
|
||||||
|
|
||||||
current_platform.seed_everything(0)
|
current_platform.seed_everything(0)
|
||||||
num_seqs = len(seq_lens)
|
num_seqs = len(seq_lens)
|
||||||
query_lens = [x[0] for x in seq_lens]
|
query_lens = [x[0] for x in seq_lens]
|
||||||
@@ -268,6 +268,10 @@ def unified_attention(
|
|||||||
assert causal, "Only causal attention is supported"
|
assert causal, "Only causal attention is supported"
|
||||||
assert q_descale is None, "Q scales not supported"
|
assert q_descale is None, "Q scales not supported"
|
||||||
|
|
||||||
|
block_size = v.shape[1]
|
||||||
|
assert q.element_size() >= 2 or block_size >= 32, \
|
||||||
|
"Block size must be at least 32 for fp8"
|
||||||
|
|
||||||
use_alibi_slopes = alibi_slopes is not None
|
use_alibi_slopes = alibi_slopes is not None
|
||||||
|
|
||||||
block_size = v.shape[1]
|
block_size = v.shape[1]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user