From c934caee88f65258aac00d71d9ae0ecc4a4e1cd7 Mon Sep 17 00:00:00 2001
From: dongbo910220 <32610838+dongbo910220@users.noreply.github.com>
Date: Sat, 15 Nov 2025 00:07:20 +0800
Subject: [PATCH] [Fix] improve aspect ratio in dummy image generation and add
 common VLM tests for PaddleOCR-VL (#28711)

Signed-off-by: dongbo910220 <1275604947@qq.com>
---
 .../multimodal/generation/test_common.py       | 18 ++++++++++++++++++
 vllm/model_executor/models/paddleocr_vl.py     |  3 +--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index 22083d9f16148..95b64b380db0d 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -12,6 +12,7 @@ import pytest
 from packaging.version import Version
 from transformers import (
     AutoModel,
+    AutoModelForCausalLM,
     AutoModelForImageTextToText,
     AutoModelForTextToWaveform,
 )
@@ -691,6 +692,23 @@ VLM_TEST_SETTINGS = {
         patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
         hf_model_kwargs={"revision": "refs/pr/5"},
     ),
+    "paddleocr_vl": VLMTestInfo(
+        models=["PaddlePaddle/PaddleOCR-VL"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
+        img_idx_to_prompt=lambda idx: (
+            "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
+        ),
+        multi_image_prompt=(
+            "Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
+            "Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
+            "Describe these two images separately."
+        ),
+        max_model_len=8192,
+        max_num_seqs=2,
+        auto_cls=AutoModelForCausalLM,
+        image_size_factors=[(), (0.25,)],
+    ),
     "phi3v": VLMTestInfo(
         models=["microsoft/Phi-3.5-vision-instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 183f458658aa3..3ef6470070d18 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -232,8 +232,7 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
         # Find factors of max_num_tokens close to its square root
         # to create a dummy image with a reasonable aspect ratio.
         h_patches = int(math.sqrt(max_num_tokens))
-        while max_num_tokens % h_patches != 0:
-            h_patches -= 1
+        max_num_tokens -= max_num_tokens % h_patches
         w_patches = max_num_tokens // h_patches
         return ImageSize(height=h_patches * factor, width=w_patches * factor)