diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 5690249eb3754..0235140187990 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -508,6 +508,19 @@ VLM_TEST_SETTINGS = {
             limit_mm_per_prompt={"image": 4},
         )],
     ),
+    # regression test for https://github.com/vllm-project/vllm/issues/15122
+    "qwen2_5_vl-windows-attention": VLMTestInfo(
+        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
+        test_type=VLMTestType.CUSTOM_INPUTS,
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        custom_test_opts=[CustomTestOptions(
+            inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
+            limit_mm_per_prompt={"image": 1},
+        )],
+    ),
 }
 # yapf: enable
 
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
index 2f03a114ae531..235618ae547ea 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
@@ -1,7 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 """Custom input builders for edge-cases in different models."""
+from io import BytesIO
 from typing import Callable
 
+import requests
+from PIL import Image
+
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import (rescale_video_size, resize_video,
                                    sample_frames_from_video)
@@ -102,3 +106,17 @@ def different_patch_input_cases_internvl():
         build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
         build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
     ]
+
+
+def windows_attention_image_qwen2_5_vl():
+    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122
+    image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
+    image = Image.open(BytesIO(requests.get(image_url).content))
+
+    question = "Describe the image."
+    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
+    prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+
+    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
+    return build_single_image_inputs([image], [prompt], wrapped_sf)
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 8a570d138c6c2..adca97c71c581 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -647,15 +647,17 @@ class Qwen2_5_VisionTransformer(nn.Module):
 
         max_seqlen = None
         seqlens = None
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        elif self.attn_backend == _Backend.XFORMERS:
-            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         for layer_num, blk in enumerate(self.blocks):
             if layer_num in self.fullatt_block_indexes:
                 cu_seqlens_now = cu_seqlens
             else:
                 cu_seqlens_now = cu_window_seqlens
+            # pre-compute cu_seqlens for window attn
+            if self.attn_backend == _Backend.FLASH_ATTN:
+                max_seqlen = (cu_seqlens_now[1:] -
+                              cu_seqlens_now[:-1]).max().item()
+            elif self.attn_backend == _Backend.XFORMERS:
+                seqlens = (cu_seqlens_now[1:] - cu_seqlens_now[:-1]).tolist()
             hidden_states = blk(
                 hidden_states,
                 cu_seqlens=cu_seqlens_now,
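
The qwen2_5_vl.py hunk is the actual fix: max_seqlen (FLASH_ATTN) and seqlens (XFORMERS) were previously computed once, outside the block loop, from the full-attention cu_seqlens, so the window-attention blocks ran with metadata that did not match the cu_window_seqlens they actually use. The sketch below is not vLLM code; the helper name and the boundary values are made up for illustration. It only shows why the two boundary tensors yield different metadata, which is why the fix recomputes it per block.

# Minimal sketch (hypothetical helper, made-up sizes) of the metadata the
# attention backends need, derived from whichever cu_seqlens a block uses.
import torch

def seqlen_metadata(cu_seqlens: torch.Tensor) -> tuple[int, list[int]]:
    # Per-sequence lengths are the differences of the cumulative boundaries.
    lengths = cu_seqlens[1:] - cu_seqlens[:-1]
    # FLASH_ATTN consumes the max length, XFORMERS consumes the full list.
    return int(lengths.max().item()), lengths.tolist()

# Full-attention blocks see one boundary span per image/frame...
cu_seqlens = torch.tensor([0, 64], dtype=torch.int32)
# ...while window-attention blocks see one span per local window.
cu_window_seqlens = torch.tensor([0, 16, 32, 48, 64], dtype=torch.int32)

print(seqlen_metadata(cu_seqlens))         # (64, [64])
print(seqlen_metadata(cu_window_seqlens))  # (16, [16, 16, 16, 16])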