From e6b8e65d2d68fc96871bc2f07999cb495e054ced Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Thu, 15 May 2025 07:26:34 +0200
Subject: [PATCH] [Bugfix] Fix fp8 tests for triton_unified_attention for
 Triton 3.3 (#18013)

Signed-off-by: Thomas Parnell
Co-authored-by: Lucas Wilkinson
---
 .../kernels/{ => attention}/test_triton_unified_attention.py | 3 +++
 vllm/attention/ops/triton_unified_attention.py               | 4 ++++
 2 files changed, 7 insertions(+)
 rename tests/kernels/{ => attention}/test_triton_unified_attention.py (98%)

diff --git a/tests/kernels/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py
similarity index 98%
rename from tests/kernels/test_triton_unified_attention.py
rename to tests/kernels/attention/test_triton_unified_attention.py
index 50da8e5fd5cd5..4e15d00255a4f 100644
--- a/tests/kernels/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -99,6 +99,9 @@ def test_triton_unified_attn(
 ) -> None:
     torch.set_default_device("cuda")
 
+    if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32:
+        pytest.skip("block size must be at least 32 for fp8")
+
     current_platform.seed_everything(0)
     num_seqs = len(seq_lens)
     query_lens = [x[0] for x in seq_lens]
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index 8c0cf9267f359..f08000a75bc7c 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -268,6 +268,10 @@ def unified_attention(
     assert causal, "Only causal attention is supported"
     assert q_descale is None, "Q scales not supported"
 
+    block_size = v.shape[1]
+    assert q.element_size() >= 2 or block_size >= 32, \
+        "Block size must be at least 32 for fp8"
+
     use_alibi_slopes = alibi_slopes is not None
 
     block_size = v.shape[1]
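
Not part of the patch itself: both hunks encode the same constraint, namely that
fp8 query/key/value tensors (1-byte element size) require a KV-cache block size
of at least 32 in the unified Triton attention kernel. The minimal standalone
sketch below restates that guard; the helper name check_fp8_block_size is
hypothetical and only illustrates the element_size() check that the patch adds
to unified_attention().

    import torch

    def check_fp8_block_size(q: torch.Tensor, block_size: int) -> None:
        # element_size() is 1 for fp8 dtypes such as torch.float8_e4m3fn and
        # >= 2 for fp16/bf16/fp32, mirroring the assert added by the patch.
        assert q.element_size() >= 2 or block_size >= 32, \
            "Block size must be at least 32 for fp8"

    # Example: an fp8 query with block_size=16 would trip the assert,
    # while block_size=32 passes (fp16/bf16 queries pass either way).
    q_fp8 = torch.empty(4, 8, 128, dtype=torch.float8_e4m3fn)
    check_fp8_block_size(q_fp8, block_size=32)

The test hunk applies the same rule preemptively via pytest.skip, so parameter
combinations that the kernel-side assert would reject are skipped rather than
reported as failures under Triton 3.3.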