mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-15 10:13:37 +08:00
[Bugfix] Disable FlexAttention direct block mask building for encoder-only models (#27344)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
parent
c9461e05a4
commit
084a9dae80
@@ -658,7 +658,10 @@ class FlexAttentionMetadataBuilder(AttentionMetadataBuilder[FlexAttentionMetadat
|
|||||||
total_cache_tokens=total_cache_tokens,
|
total_cache_tokens=total_cache_tokens,
|
||||||
decode_offset=offset_tensor,
|
decode_offset=offset_tensor,
|
||||||
num_blocks_per_seq=num_blocks_per_seq,
|
num_blocks_per_seq=num_blocks_per_seq,
|
||||||
direct_build=self.direct_build,
|
# FIXME(Isotr0py): direct build fails to build the bidirectional
|
||||||
|
# attention block mask for encoder-only models; disable it temporarily.
|
||||||
|
# see: https://github.com/vllm-project/vllm/pull/27329#issuecomment-3431484053
|
||||||
|
direct_build=(self.direct_build and common_attn_metadata.causal),
|
||||||
q_block_size=self.q_block_size,
|
q_block_size=self.q_block_size,
|
||||||
kv_block_size=self.kv_block_size,
|
kv_block_size=self.kv_block_size,
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user