[Bugfix] Fix incorrect qwen2.5-vl attention mask pre-computation (#15200)
Signed-off-by: Isotr0py <2037008807@qq.com>
parent 2e0b4cfde0
commit 1e508343e1
@@ -508,6 +508,19 @@ VLM_TEST_SETTINGS = {
             limit_mm_per_prompt={"image": 4},
         )],
     ),
+    # regression test for https://github.com/vllm-project/vllm/issues/15122
+    "qwen2_5_vl-windows-attention": VLMTestInfo(
+        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
+        test_type=VLMTestType.CUSTOM_INPUTS,
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForVision2Seq,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        custom_test_opts=[CustomTestOptions(
+            inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
+            limit_mm_per_prompt={"image": 1},
+        )],
+    ),
 }
 # yapf: enable
 
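The entry above drives the regression through vLLM's VLM test harness. For reproducing issue #15122 outside the harness, a minimal sketch using vLLM's public LLM API follows; the model name, context length, and prompt format are taken from the test entry above, while the sampling settings and the expectation in the final comment are assumptions, not part of this commit.

    from io import BytesIO

    import requests
    from PIL import Image

    from vllm import LLM, SamplingParams

    # Same image the regression test uses (see the custom input builder below).
    image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
    image = Image.open(BytesIO(requests.get(image_url, timeout=30).content))

    llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
              max_model_len=4096,
              max_num_seqs=2)

    prompt = ("<|im_start|>User\n<|vision_start|><|image_pad|><|vision_end|>"
              "Describe the image.<|im_end|>\n<|im_start|>assistant\n")

    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": image}},
        SamplingParams(max_tokens=128),
    )
    # Before this fix the description was garbled for this image; after it,
    # the output should be coherent (assumed expectation based on the issue).
    print(outputs[0].outputs[0].text)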
@@ -1,7 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 """Custom input builders for edge-cases in different models."""
+from io import BytesIO
 from typing import Callable
 
+import requests
+from PIL import Image
+
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import (rescale_video_size, resize_video,
                                    sample_frames_from_video)
@@ -102,3 +106,17 @@ def different_patch_input_cases_internvl():
         build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
         build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
     ]
+
+
+def windows_attention_image_qwen2_5_vl():
+    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122
+    image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
+    image = Image.open(BytesIO(requests.get(image_url).content))
+
+    question = "Describe the image."
+    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
+    prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
+              "<|im_start|>assistant\n")
+
+    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
+    return build_single_image_inputs([image], [prompt], wrapped_sf)
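A note on the SIZE_FACTOR wrapper used above: it asks the harness to rescale the source image before the single-image inputs are built. As a rough sketch of what a 0.5 size factor amounts to (this inline helper is an assumption about rescale_image_size's behavior, not its actual implementation):

    from PIL import Image


    def rescale_by_factor(image: Image.Image, factor: float) -> Image.Image:
        # Scale both dimensions by the same factor, preserving aspect ratio.
        width, height = image.size
        return image.resize((int(width * factor), int(height * factor)))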
@@ -647,15 +647,17 @@ class Qwen2_5_VisionTransformer(nn.Module):
 
         max_seqlen = None
         seqlens = None
-        if self.attn_backend == _Backend.FLASH_ATTN:
-            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        elif self.attn_backend == _Backend.XFORMERS:
-            seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
         for layer_num, blk in enumerate(self.blocks):
             if layer_num in self.fullatt_block_indexes:
                 cu_seqlens_now = cu_seqlens
             else:
                 cu_seqlens_now = cu_window_seqlens
+            # pre-compute cu_seqlens for window attn
+            if self.attn_backend == _Backend.FLASH_ATTN:
+                max_seqlen = (cu_seqlens_now[1:] -
+                              cu_seqlens_now[:-1]).max().item()
+            elif self.attn_backend == _Backend.XFORMERS:
+                seqlens = (cu_seqlens_now[1:] - cu_seqlens_now[:-1]).tolist()
             hidden_states = blk(
                 hidden_states,
                 cu_seqlens=cu_seqlens_now,
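This last hunk is the fix itself. Previously, max_seqlen (FLASH_ATTN) and seqlens (XFORMERS) were computed once from cu_seqlens before the loop, but every block outside fullatt_block_indexes runs with cu_window_seqlens, so the attention-mask metadata no longer matched the sequence boundaries those blocks actually used. Moving the computation inside the loop derives it from cu_seqlens_now for each block. A toy illustration of the mismatch (the tensor values are made up for illustration, not vLLM internals):

    import torch

    # Boundaries for full attention vs. window attention over 128 patch tokens.
    cu_seqlens = torch.tensor([0, 64, 128])
    cu_window_seqlens = torch.tensor([0, 16, 32, 48, 64, 80, 96, 112, 128])

    # Buggy behavior: seqlens derived once from cu_seqlens, reused everywhere.
    stale_seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()          # [64, 64]

    # Fixed behavior: derive from the boundaries the current block actually uses.
    cu_seqlens_now = cu_window_seqlens  # a window-attention block
    fresh_seqlens = (cu_seqlens_now[1:] - cu_seqlens_now[:-1]).tolist()  # [16] * 8

    # The stale metadata describes 64-token blocks while the kernel is fed
    # 16-token windows, so window blocks attend with the wrong mask.
    assert stale_seqlens != fresh_seqlens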