Merge branch 'main' into woosuk/input-prep

2026-07-08 08:37:08 +08:00 · 2025-09-14 08:03:28 -07:00 · 2025-09-14 08:03:28 -07:00 · 633f9f006d
commit 633f9f006d
parent eb3742c72a fec347dee1
2 changed files with 16 additions and 9 deletions
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -363,8 +363,10 @@ class Qwen2_5_VisionAttention(nn.Module):
        q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous()
                   for x in (q, k, v))
        if rotary_pos_emb is not None:
-            q = apply_rotary_pos_emb_vision(q, rotary_pos_emb)
-            k = apply_rotary_pos_emb_vision(k, rotary_pos_emb)
+            # [2 * b, s, heads, head_dim]
+            qk_concat = torch.cat([q, k], dim=0)
+            qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb)
+            q, k = torch.chunk(qk_rotated, 2, dim=0)

        if self.is_flash_attn_backend:
            if self.attn_backend == _Backend.ROCM_AITER_FA:
@@ -388,8 +390,8 @@ class Qwen2_5_VisionAttention(nn.Module):
                                            causal=False)

            context_layer = rearrange(output,
-                                      "(b s) ... -> b s ...",
-                                      b=batch_size)
+                                      "(b s) h d -> s b (h d)",
+                                      b=batch_size).contiguous()
        elif self.attn_backend == _Backend.TORCH_SDPA:
            # Execute attention entry by entry for speed & less VRAM.
            outputs = []
@@ -408,6 +410,8 @@ class Qwen2_5_VisionAttention(nn.Module):
                output_i = rearrange(output_i, "b h s d -> b s h d ")
                outputs.append(output_i)
            context_layer = torch.cat(outputs, dim=1)
+            context_layer = rearrange(context_layer,
+                                      "b s h d -> s b (h d)").contiguous()
        elif self.attn_backend == _Backend.XFORMERS:
            from xformers import ops as xops
            from xformers.ops.fmha.attn_bias import BlockDiagonalMask
@@ -418,8 +422,8 @@ class Qwen2_5_VisionAttention(nn.Module):

            context_layer = xops.memory_efficient_attention_forward(
                q, k, v, attn_bias=attn_bias, p=0, scale=None)
-        context_layer = rearrange(context_layer,
-                                  "b s h d -> s b (h d)").contiguous()
+            context_layer = rearrange(context_layer,
+                                      "b s h d -> s b (h d)").contiguous()

        output, _ = self.proj(context_layer)
        return output
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -2,10 +2,13 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import fnmatch
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Optional

 from vllm.utils import PlaceholderModule

+if TYPE_CHECKING:
+    from botocore.client import BaseClient
+
 try:
    import boto3
 except ImportError:
@@ -26,7 +29,7 @@ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
    ]


-def glob(s3: Optional[Any] = None,
+def glob(s3: Optional["BaseClient"] = None,
         path: str = "",
         allow_pattern: Optional[list[str]] = None) -> list[str]:
    """
@@ -51,7 +54,7 @@ def glob(s3: Optional[Any] = None,


 def list_files(
-        s3: Any,
+        s3: "BaseClient",
        path: str,
        allow_pattern: Optional[list[str]] = None,
        ignore_pattern: Optional[list[str]] = None