[Fix] improve aspect ratio in dummy image generation and add common VLM tests for PaddleOCR-VL (#28711)

Signed-off-by: dongbo910220 <1275604947@qq.com>
2026-07-09 17:27:15 +08:00 · 2025-11-15 00:07:20 +08:00 · 2025-11-15 00:07:20 +08:00 · c934caee88
commit c934caee88
parent 3f8a874065
2 changed files with 19 additions and 2 deletions
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -12,6 +12,7 @@ import pytest
 from packaging.version import Version
 from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForImageTextToText,
    AutoModelForTextToWaveform,
 )
@@ -691,6 +692,23 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
        hf_model_kwargs={"revision": "refs/pr/5"},
    ),
    "paddleocr_vl": VLMTestInfo(
        models=["PaddlePaddle/PaddleOCR-VL"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        img_idx_to_prompt=lambda idx: (
            "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
        ),
        multi_image_prompt=(
            "Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
            "Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
            "Describe these two images separately."
        ),
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForCausalLM,
        image_size_factors=[(), (0.25,)],
    ),
    "phi3v": VLMTestInfo(
        models=["microsoft/Phi-3.5-vision-instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -232,8 +232,7 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
        # Find factors of max_num_tokens close to its square root
        # to create a dummy image with a reasonable aspect ratio.
        h_patches = int(math.sqrt(max_num_tokens))
-        while max_num_tokens % h_patches != 0:
+        max_num_tokens -= max_num_tokens % h_patches
-            h_patches -= 1
        w_patches = max_num_tokens // h_patches
        return ImageSize(height=h_patches * factor, width=w_patches * factor)