[Bugfix] Remove tile_size=64 for mm_prefix triton attention (#30973)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
(cherry picked from commit d2dc5dfc6ecafbd3d725c1c42dd019db2b1efd30)
Authored by Isotr0py on 2025-12-19 03:42:32 +08:00; committed by Kevin H. Luu
parent ac43367ced
commit b2eb84de77


@@ -800,7 +800,6 @@ def _get_tile_size(
     head_size: int,
     sliding_window: int,
     element_size: int,
-    is_mm_prefix: bool,
     is_prefill: bool,
 ) -> int:
     """Select tile size with Gemma3-specific optimization.
@@ -809,10 +808,6 @@ def _get_tile_size(
     the larger head dimension (128/256). For other models, use
     the default vLLM behavior.
     """
-    if is_mm_prefix:
-        # Multimodal bidirectional attention needs a larger tile size
-        return 64
-
     if _is_gemma3_attention(head_size, sliding_window):
         # Gemma3: use 32 for decode (default is 16)
         return 32
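
For context, a minimal self-contained sketch of the tile-size selection after this change. The body of _is_gemma3_attention, the default of 16, and the unused fallback parameters are assumptions inferred only from the comments visible in this hunk, not the actual vLLM implementation:

def _is_gemma3_attention(head_size: int, sliding_window: int) -> bool:
    # Hypothetical predicate: per the docstring above, Gemma3 is
    # recognized by its larger head dimension (128/256) combined with
    # sliding-window attention.
    return head_size in (128, 256) and sliding_window > 0

def _get_tile_size(
    head_size: int,
    sliding_window: int,
    element_size: int,
    is_prefill: bool,
) -> int:
    # The is_mm_prefix branch that hard-coded tile size 64 is gone, so
    # multimodal-prefix (bidirectional) attention now goes through the
    # same selection as everything else.
    if _is_gemma3_attention(head_size, sliding_window):
        # Gemma3: use 32 for decode (default is 16)
        return 32
    # Assumed default (16, per the diff comment above); the real
    # fallback may also consult is_prefill and element_size.
    return 16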
@@ -903,14 +898,12 @@ def unified_attention(
         head_size,
         sliding_window_val,
         q.element_size(),
-        is_mm_prefix=use_mm_prefix,
         is_prefill=True,
     )
     TILE_SIZE_DECODE = _get_tile_size(
         head_size,
         sliding_window_val,
         q.element_size(),
-        is_mm_prefix=use_mm_prefix,
         is_prefill=False,
     )
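
Because the parameter was removed from the signature, both call sites drop the is_mm_prefix keyword as well; a caller left unchanged would raise TypeError: _get_tile_size() got an unexpected keyword argument 'is_mm_prefix'. A short usage sketch against the function above, with illustrative values substituted for the names taken from this hunk:

import torch

q = torch.empty(1, 8, 128, dtype=torch.bfloat16)  # example query tensor
head_size = 128            # illustrative head dimension
sliding_window_val = 1024  # illustrative sliding-window length

TILE_SIZE_PREFILL = _get_tile_size(
    head_size,
    sliding_window_val,
    q.element_size(),  # bytes per element (2 for bfloat16)
    is_prefill=True,
)
TILE_SIZE_DECODE = _get_tile_size(
    head_size,
    sliding_window_val,
    q.element_size(),
    is_prefill=False,
)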