diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index fc028aa2287a2..dbf486374bcf3 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -363,8 +363,10 @@ class Qwen2_5_VisionAttention(nn.Module): q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)) if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + # [2 * b, s, heads, head_dim] + qk_concat = torch.cat([q, k], dim=0) + qk_rotated = apply_rotary_pos_emb_vision(qk_concat, rotary_pos_emb) + q, k = torch.chunk(qk_rotated, 2, dim=0) if self.is_flash_attn_backend: if self.attn_backend == _Backend.ROCM_AITER_FA: @@ -388,8 +390,8 @@ class Qwen2_5_VisionAttention(nn.Module): causal=False) context_layer = rearrange(output, - "(b s) ... -> b s ...", - b=batch_size) + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() elif self.attn_backend == _Backend.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. outputs = [] @@ -408,6 +410,8 @@ class Qwen2_5_VisionAttention(nn.Module): output_i = rearrange(output_i, "b h s d -> b s h d ") outputs.append(output_i) context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask @@ -418,8 +422,8 @@ class Qwen2_5_VisionAttention(nn.Module): context_layer = xops.memory_efficient_attention_forward( q, k, v, attn_bias=attn_bias, p=0, scale=None) - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() output, _ = self.proj(context_layer) return output diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index d17c1afe9b504..b848898ff6dad 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -2,10 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch -from typing import Any, Optional +from typing import TYPE_CHECKING, Optional from vllm.utils import PlaceholderModule +if TYPE_CHECKING: + from botocore.client import BaseClient + try: import boto3 except ImportError: @@ -26,7 +29,7 @@ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]: ] -def glob(s3: Optional[Any] = None, +def glob(s3: Optional["BaseClient"] = None, path: str = "", allow_pattern: Optional[list[str]] = None) -> list[str]: """ @@ -51,7 +54,7 @@ def glob(s3: Optional[Any] = None, def list_files( - s3: Any, + s3: "BaseClient", path: str, allow_pattern: Optional[list[str]] = None, ignore_pattern: Optional[list[str]] = None