diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index de6365c0d858..ea7a793d026b 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -593,21 +593,21 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: - from qwen_vl_utils import process_vision_info + from qwen_vl_utils import smart_resize except ModuleNotFoundError: print( "WARNING: `qwen-vl-utils` not installed, input images will not " "be automatically resized. You can enable this functionality by " "`pip install qwen-vl-utils`." ) - process_vision_info = None + smart_resize = None model_name = "Qwen/Qwen2-VL-7B-Instruct" # Tested on L40 engine_args = EngineArgs( model=model_name, - max_model_len=32768 if process_vision_info is None else 4096, + max_model_len=32768 if smart_resize is None else 4096, max_num_seqs=5, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -630,10 +630,18 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: messages, tokenize=False, add_generation_prompt=True ) - if process_vision_info is None: + if smart_resize is None: image_data = [fetch_image(url) for url in image_urls] else: - image_data, _ = process_vision_info(messages) + + def post_process_image(image: Image) -> Image: + width, height = image.size + resized_height, resized_width = smart_resize( + height, width, max_pixels=1024 * 28 * 28 + ) + return image.resize((resized_width, resized_height)) + + image_data = [post_process_image(fetch_image(url)) for url in image_urls] return ModelRequestData( engine_args=engine_args, @@ -644,20 +652,20 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: - from qwen_vl_utils import process_vision_info + from qwen_vl_utils import smart_resize except ModuleNotFoundError: print( "WARNING: `qwen-vl-utils` not installed, input images will not " "be automatically resized. You can enable this functionality by " "`pip install qwen-vl-utils`." ) - process_vision_info = None + smart_resize = None model_name = "Qwen/Qwen2.5-VL-3B-Instruct" engine_args = EngineArgs( model=model_name, - max_model_len=32768 if process_vision_info is None else 4096, + max_model_len=32768 if smart_resize is None else 4096, max_num_seqs=5, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -680,10 +688,18 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: messages, tokenize=False, add_generation_prompt=True ) - if process_vision_info is None: + if smart_resize is None: image_data = [fetch_image(url) for url in image_urls] else: - image_data, _ = process_vision_info(messages, return_video_kwargs=False) + + def post_process_image(image: Image) -> Image: + width, height = image.size + resized_height, resized_width = smart_resize( + height, width, max_pixels=1024 * 28 * 28 + ) + return image.resize((resized_width, resized_height)) + + image_data = [post_process_image(fetch_image(url)) for url in image_urls] return ModelRequestData( engine_args=engine_args,