[Fix] improve aspect ratio in dummy image generation and add common VLM tests for PaddleOCR-VL (#28711)

Signed-off-by: dongbo910220 <1275604947@qq.com>
This commit is contained in:
dongbo910220 2025-11-15 00:07:20 +08:00 committed by GitHub
parent 3f8a874065
commit c934caee88
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 19 additions and 2 deletions

View File

@ -12,6 +12,7 @@ import pytest
from packaging.version import Version
from transformers import (
AutoModel,
AutoModelForCausalLM,
AutoModelForImageTextToText,
AutoModelForTextToWaveform,
)
@ -691,6 +692,23 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
hf_model_kwargs={"revision": "refs/pr/5"},
),
"paddleocr_vl": VLMTestInfo(
models=["PaddlePaddle/PaddleOCR-VL"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
img_idx_to_prompt=lambda idx: (
"<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
),
multi_image_prompt=(
"Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
"Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
"Describe these two images separately."
),
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForCausalLM,
image_size_factors=[(), (0.25,)],
),
"phi3v": VLMTestInfo(
models=["microsoft/Phi-3.5-vision-instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),

View File

@ -232,8 +232,7 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
# Pick a patch height close to the square root of max_num_tokens and round
# max_num_tokens down to a multiple of it, so the dummy image gets a
# reasonable aspect ratio.
h_patches = int(math.sqrt(max_num_tokens))
while max_num_tokens % h_patches != 0:
h_patches -= 1
max_num_tokens -= max_num_tokens % h_patches
w_patches = max_num_tokens // h_patches
return ImageSize(height=h_patches * factor, width=w_patches * factor)