mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 07:14:59 +08:00
[Bugfix] Fix max image size for PaddleOCR-VL (#28442)
Signed-off-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
parent
cc079763c5
commit
4fd4b743a2
@ -198,13 +198,10 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
|
|||||||
if image_processor is None:
|
if image_processor is None:
|
||||||
image_processor = self.get_image_processor()
|
image_processor = self.get_image_processor()
|
||||||
|
|
||||||
do_resize = True
|
|
||||||
hf_config = self.get_hf_config()
|
hf_config = self.get_hf_config()
|
||||||
vision_config = hf_config.vision_config
|
vision_config = hf_config.vision_config
|
||||||
patch_size = vision_config.patch_size
|
patch_size = vision_config.patch_size
|
||||||
merge_size = vision_config.spatial_merge_size
|
merge_size = vision_config.spatial_merge_size
|
||||||
|
|
||||||
if do_resize:
|
|
||||||
resized_height, resized_width = smart_resize(
|
resized_height, resized_width = smart_resize(
|
||||||
height=image_height,
|
height=image_height,
|
||||||
width=image_width,
|
width=image_width,
|
||||||
@ -213,8 +210,6 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
|
|||||||
max_pixels=image_processor.max_pixels,
|
max_pixels=image_processor.max_pixels,
|
||||||
)
|
)
|
||||||
preprocessed_size = ImageSize(width=resized_width, height=resized_height)
|
preprocessed_size = ImageSize(width=resized_width, height=resized_height)
|
||||||
else:
|
|
||||||
preprocessed_size = ImageSize(width=image_width, height=image_height)
|
|
||||||
|
|
||||||
grid_t = 1
|
grid_t = 1
|
||||||
grid_h = preprocessed_size.height // patch_size
|
grid_h = preprocessed_size.height // patch_size
|
||||||
@ -227,8 +222,19 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
|
|||||||
|
|
||||||
def get_image_size_with_most_features(self) -> ImageSize:
    """Return a dummy image size that fills the processor's pixel budget.

    The size is derived from the image processor's ``max_pixels`` rather
    than from ``vision_config.image_size``. See `smart_resize` for the
    calculation of the image size.

    Returns:
        An ``ImageSize`` whose height and width are both multiples of
        ``spatial_merge_size * patch_size``.
    """
    vision_config = self.get_hf_config().vision_config
    factor = vision_config.spatial_merge_size * vision_config.patch_size

    # Clamp to a single merged patch: if ``max_pixels`` is smaller than
    # ``factor**2`` the floor division yields 0, and the divisor search
    # below would start at 0 and raise ZeroDivisionError.
    max_pixels = self.get_image_processor().max_pixels
    max_num_tokens = max(1, max_pixels // (factor**2))

    # Find factors of max_num_tokens close to its square root
    # to create a dummy image with a reasonable aspect ratio.
    # ``math.isqrt`` is exact for integers (no float rounding).
    h_patches = math.isqrt(max_num_tokens)
    while max_num_tokens % h_patches != 0:
        h_patches -= 1
    w_patches = max_num_tokens // h_patches

    return ImageSize(height=h_patches * factor, width=w_patches * factor)
|
||||||
|
|
||||||
|
|
||||||
class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessingInfo]):
|
class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessingInfo]):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user