From 084a9dae801c6756cb50c03daa313bca96fa660b Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Thu, 23 Oct 2025 00:39:08 +0800
Subject: [PATCH] [Bugfix] Disable FlexAttention direct block mask building
 for encoder-only models (#27344)

Signed-off-by: Isotr0py
---
 vllm/v1/attention/backends/flex_attention.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py
index e1fb48b309932..ffea14ec63f86 100644
--- a/vllm/v1/attention/backends/flex_attention.py
+++ b/vllm/v1/attention/backends/flex_attention.py
@@ -658,7 +658,10 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
             total_cache_tokens=total_cache_tokens,
             decode_offset=offset_tensor,
             num_blocks_per_seq=num_blocks_per_seq,
-            direct_build=self.direct_build,
+            # FIXME(Isotr0py): direct build has issues building the bidirectional
+            # attention block mask for encoder-only models; disable it temporarily.
+            # see: https://github.com/vllm-project/vllm/pull/27329#issuecomment-3431484053
+            direct_build=(self.direct_build and common_attn_metadata.causal),
             q_block_size=self.q_block_size,
             kv_block_size=self.kv_block_size,
         )
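
Note: the sketch below is not vLLM's metadata builder; it is a minimal, standalone
illustration (using PyTorch's public flex_attention API, with made-up shapes and
block sizes, requiring roughly PyTorch 2.5+) of why the direct-build fast path is
gated on common_attn_metadata.causal above. A causal mask_mod and a bidirectional
(encoder-only) mask_mod produce very different BlockMask layouts, so a direct
construction tuned to the causal layout would mis-describe the encoder-only case,
while the generic create_block_mask path handles both.

    import torch
    from torch.nn.attention.flex_attention import create_block_mask, flex_attention

    device = "cuda" if torch.cuda.is_available() else "cpu"

    def causal_mask(b, h, q_idx, kv_idx):
        # Decoder-style: each query attends only to itself and earlier positions.
        return q_idx >= kv_idx

    def bidirectional_mask(b, h, q_idx, kv_idx):
        # Encoder-only style: every query attends to every key position.
        return kv_idx >= 0

    S, H, D = 256, 4, 64  # arbitrary sequence length, heads, head dim
    causal_bm = create_block_mask(
        causal_mask, B=None, H=None, Q_LEN=S, KV_LEN=S, device=device
    )
    bidir_bm = create_block_mask(
        bidirectional_mask, B=None, H=None, Q_LEN=S, KV_LEN=S, device=device
    )

    # The causal mask skips roughly half of the KV blocks; the bidirectional
    # mask keeps all of them, so the two BlockMask structures differ.
    print(f"causal block sparsity: {causal_bm.sparsity():.1f}%")
    print(f"bidirectional block sparsity: {bidir_bm.sparsity():.1f}%")

    q, k, v = (torch.randn(1, H, S, D, device=device) for _ in range(3))
    out = flex_attention(q, k, v, block_mask=bidir_bm)

Routing non-causal metadata through the generic create_block_mask path, as the
patch does, sidesteps the direct builder until it learns the bidirectional layout.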