From d2dc5dfc6ecafbd3d725c1c42dd019db2b1efd30 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Fri, 19 Dec 2025 03:42:32 +0800
Subject: [PATCH] [Bugfix] Remove `tile_size=64` for mm_prefix triton
 attention (#30973)

Signed-off-by: Isotr0py
---
 vllm/attention/ops/triton_unified_attention.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index ae5a48ec3d26d..f61c8e9b89c24 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -800,7 +800,6 @@ def _get_tile_size(
     head_size: int,
     sliding_window: int,
     element_size: int,
-    is_mm_prefix: bool,
     is_prefill: bool,
 ) -> int:
     """Select tile size with Gemma3-specific optimization.
@@ -809,10 +808,6 @@
     the larger head dimension (128/256). For other models, use the
     default vLLM behavior.
     """
-    if is_mm_prefix:
-        # Multimodal bidirectional attention needs a larger tile size
-        return 64
-
     if _is_gemma3_attention(head_size, sliding_window):
         # Gemma3: use 32 for decode (default is 16)
         return 32
@@ -903,14 +898,12 @@ def unified_attention(
         head_size,
         sliding_window_val,
         q.element_size(),
-        is_mm_prefix=use_mm_prefix,
         is_prefill=True,
     )
     TILE_SIZE_DECODE = _get_tile_size(
         head_size,
         sliding_window_val,
         q.element_size(),
-        is_mm_prefix=use_mm_prefix,
         is_prefill=False,
     )