From e6b8e65d2d68fc96871bc2f07999cb495e054ced Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Thu, 15 May 2025 07:26:34 +0200
Subject: [PATCH] [Bugfix] Fix fp8 tests for triton_unified_attention for
 Triton 3.3 (#18013)

Signed-off-by: Thomas Parnell
Co-authored-by: Lucas Wilkinson
---
 .../kernels/{ => attention}/test_triton_unified_attention.py | 3 +++
 vllm/attention/ops/triton_unified_attention.py               | 4 ++++
 2 files changed, 7 insertions(+)
 rename tests/kernels/{ => attention}/test_triton_unified_attention.py (98%)

diff --git a/tests/kernels/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
similarity index 98%
rename from tests/kernels/test_triton_unified_attention.py
rename to tests/kernels/attention/test_triton_unified_attention.py
index 50da8e5fd5cd5..4e15d00255a4f 100644
--- a/tests/kernels/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -99,6 +99,9 @@ def test_triton_unified_attn(
 ) -> None:
     torch.set_default_device("cuda")
 
+    if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32:
+        pytest.skip("block size must be at least 32 for fp8")
+
     current_platform.seed_everything(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index 8c0cf9267f359..f08000a75bc7c 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -268,6 +268,10 @@ def unified_attention(
     assert causal, "Only causal attention is supported"
     assert q_descale is None, "Q scales not supported"
 
+    block_size = v.shape[1]
+    assert q.element_size() >= 2 or block_size >= 32, \
+        "Block size must be at least 32 for fp8"
+
     use_alibi_slopes = alibi_slopes is not None
 
     block_size = v.shape[1]
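
Not part of the patch itself: both hunks encode the same constraint, namely that
fp8 query/key/value tensors (1-byte element size) require a KV-cache block size
of at least 32 in the unified Triton attention kernel. The minimal standalone
sketch below restates that guard; the helper name check_fp8_block_size is
hypothetical and only illustrates the element_size() check that the patch adds
to unified_attention().

    import torch

    def check_fp8_block_size(q: torch.Tensor, block_size: int) -> None:
        # element_size() is 1 for fp8 dtypes such as torch.float8_e4m3fn and
        # >= 2 for fp16/bf16/fp32, mirroring the assert added by the patch.
        assert q.element_size() >= 2 or block_size >= 32, \
            "Block size must be at least 32 for fp8"

    # Example: an fp8 query with block_size=16 would trip the assert,
    # while block_size=32 passes (fp16/bf16 queries pass either way).
    q_fp8 = torch.empty(4, 8, 128, dtype=torch.float8_e4m3fn)
    check_fp8_block_size(q_fp8, block_size=32)

The test hunk applies the same rule preemptively via pytest.skip, so parameter
combinations that the kernel-side assert would reject are skipped rather than
reported as failures under Triton 3.3.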