From 4fd4b743a23cc6ccbd832f11be12317a8c2f0fbc Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 11 Nov 2025 00:07:24 -0800 Subject: [PATCH] [Bugfix] Fix max image size for PaddleOCR-VL (#28442) Signed-off-by: Roger Wang --- vllm/model_executor/models/paddleocr_vl.py | 36 +++++++++++++--------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 631475c964c0b..12ae15699e7d2 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -198,23 +198,18 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo): if image_processor is None: image_processor = self.get_image_processor() - do_resize = True hf_config = self.get_hf_config() vision_config = hf_config.vision_config patch_size = vision_config.patch_size merge_size = vision_config.spatial_merge_size - - if do_resize: - resized_height, resized_width = smart_resize( - height=image_height, - width=image_width, - factor=patch_size * merge_size, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, - ) - preprocessed_size = ImageSize(width=resized_width, height=resized_height) - else: - preprocessed_size = ImageSize(width=image_width, height=image_height) + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * merge_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize(width=resized_width, height=resized_height) grid_t = 1 grid_h = preprocessed_size.height // patch_size @@ -227,8 +222,19 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo): def get_image_size_with_most_features(self) -> ImageSize: hf_config = self.get_hf_config() - image_size = hf_config.vision_config.image_size - return ImageSize(height=image_size, width=image_size) + + # See `smart_resize` for the calculation of the image size. + merge_size = hf_config.vision_config.spatial_merge_size + patch_size = hf_config.vision_config.patch_size + factor = merge_size * patch_size + max_num_tokens = self.get_image_processor().max_pixels // (factor**2) + # Find factors of max_num_tokens close to its square root + # to create a dummy image with a reasonable aspect ratio. + h_patches = int(math.sqrt(max_num_tokens)) + while max_num_tokens % h_patches != 0: + h_patches -= 1 + w_patches = max_num_tokens // h_patches + return ImageSize(height=h_patches * factor, width=w_patches * factor) class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessingInfo]):