[Bugfix] Remove tile_size=64 for mm_prefix triton attention (#30973)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent b8c477c115
commit d2dc5dfc6e
@@ -800,7 +800,6 @@ def _get_tile_size(
     head_size: int,
     sliding_window: int,
     element_size: int,
-    is_mm_prefix: bool,
     is_prefill: bool,
 ) -> int:
     """Select tile size with Gemma3-specific optimization.
@@ -809,10 +808,6 @@ def _get_tile_size(
     the larger head dimension (128/256). For other models, use
     the default vLLM behavior.
     """
-    if is_mm_prefix:
-        # Multimodal bidirectional attention needs a larger tile size
-        return 64
-
     if _is_gemma3_attention(head_size, sliding_window):
         # Gemma3: use 32 for decode (default is 16)
         return 32
@@ -903,14 +898,12 @@ def unified_attention(
         head_size,
         sliding_window_val,
         q.element_size(),
-        is_mm_prefix=use_mm_prefix,
         is_prefill=True,
     )
     TILE_SIZE_DECODE = _get_tile_size(
         head_size,
         sliding_window_val,
         q.element_size(),
-        is_mm_prefix=use_mm_prefix,
         is_prefill=False,
     )
 
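For reference, below is a minimal, self-contained sketch of the selection path as it stands after this patch, with the mm_prefix branch gone. Only the signature, the removed branch, and the Gemma3 decode value of 32 (default 16) come from the diff above; the _is_gemma3_attention stub and the fallback value of 16 are hypothetical stand-ins for code that lives outside these hunks.

def _is_gemma3_attention(head_size: int, sliding_window: int) -> bool:
    # Hypothetical stand-in: the docstring in the diff mentions the larger
    # Gemma3 head dimensions (128/256); the real check is not shown here.
    return head_size in (128, 256) and sliding_window > 0


def _get_tile_size(
    head_size: int,
    sliding_window: int,
    element_size: int,
    is_prefill: bool,
) -> int:
    """Select tile size with Gemma3-specific optimization (sketch)."""
    if _is_gemma3_attention(head_size, sliding_window) and not is_prefill:
        # Gemma3: use 32 for decode (default is 16), per the diff above.
        return 32
    # Fallback assumed for illustration; any element_size-dependent logic
    # in the real backend is outside the hunks shown here.
    return 16

The call sites in unified_attention simply drop the is_mm_prefix keyword, e.g. _get_tile_size(head_size, sliding_window_val, q.element_size(), is_prefill=False).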