From d04973ad5446fe05c06035f6b2d99402fc3ac7bf Mon Sep 17 00:00:00 2001 From: Bellk17 Date: Fri, 12 Apr 2024 16:41:26 -0700 Subject: [PATCH] Fix triton compilation issue (#3984) Co-authored-by: Woosuk Kwon --- vllm/attention/ops/triton_flash_attention.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index 87cf30cbef79a..e160411859f0b 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -415,7 +415,11 @@ def attn_fwd( return is_mqa = hq != hk - off_h_k = off_h_q % hk if is_mqa else off_h_q + if is_mqa: # noqa: SIM108 + off_h_k = off_h_q % hk + else: + off_h_k = off_h_q + n_extra_tokens = 0 if seqlen_k < BLOCK_N: n_extra_tokens = BLOCK_N - seqlen_k