From c934caee88f65258aac00d71d9ae0ecc4a4e1cd7 Mon Sep 17 00:00:00 2001 From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com> Date: Sat, 15 Nov 2025 00:07:20 +0800 Subject: [PATCH] [Fix] improve aspect ratio in dummy image generation and add common VLM tests for PaddleOCR-VL (#28711) Signed-off-by: dongbo910220 <1275604947@qq.com> --- .../multimodal/generation/test_common.py | 18 ++++++++++++++++++ vllm/model_executor/models/paddleocr_vl.py | 3 +-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 22083d9f16148..95b64b380db0d 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -12,6 +12,7 @@ import pytest from packaging.version import Version from transformers import ( AutoModel, + AutoModelForCausalLM, AutoModelForImageTextToText, AutoModelForTextToWaveform, ) @@ -691,6 +692,23 @@ VLM_TEST_SETTINGS = { patch_hf_runner=model_utils.ovis2_5_patch_hf_runner, hf_model_kwargs={"revision": "refs/pr/5"}, ), + "paddleocr_vl": VLMTestInfo( + models=["PaddlePaddle/PaddleOCR-VL"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", + img_idx_to_prompt=lambda idx: ( + "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" + ), + multi_image_prompt=( + "Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n" + "Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n" + "Describe these two images separately." + ), + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForCausalLM, + image_size_factors=[(), (0.25,)], + ), "phi3v": VLMTestInfo( models=["microsoft/Phi-3.5-vision-instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 183f458658aa3..3ef6470070d18 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -232,8 +232,7 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo): # Find factors of max_num_tokens close to its square root # to create a dummy image with a reasonable aspect ratio. h_patches = int(math.sqrt(max_num_tokens)) - while max_num_tokens % h_patches != 0: - h_patches -= 1 + max_num_tokens -= max_num_tokens % h_patches w_patches = max_num_tokens // h_patches return ImageSize(height=h_patches * factor, width=w_patches * factor)