From a21cd9ed239b853bd587ffe3c9140fe68cd41f59 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Wed, 3 Dec 2025 18:05:10 +0800
Subject: [PATCH] [Bugfix] Fix incorrect `image_grid_thw` rank for HunyuanOCR from missing `merge_by_field_config=True` (#29950)

Signed-off-by: Isotr0py
---
 .../vision_language_multi_image.py           | 23 +++++++++++++++++++
 vllm/model_executor/models/hunyuan_vision.py |  1 +
 2 files changed, 24 insertions(+)

diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 2193b1ca9cf4..560ca768d1a6 100755
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -309,6 +309,28 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+# HunyuanOCR
+def load_hunyuan_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "tencent/HunyuanOCR"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholder = (
+        "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
+    ) * len(image_urls)
+    prompt = f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_hyperclovax_seed_vision(
     question: str, image_urls: list[str]
 ) -> ModelRequestData:
@@ -1322,6 +1344,7 @@ model_example_map = {
     "deepseek_ocr": load_deepseek_ocr,
     "gemma3": load_gemma3,
     "h2ovl_chat": load_h2ovl,
+    "hunyuan_vl": load_hunyuan_vl,
     "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
     "idefics3": load_idefics3,
     "interns1": load_interns1,
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 2950db571e6e..6537b6df876a 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -785,6 +785,7 @@ class HunYuanVLForConditionalGeneration(
     SupportsQuant,
     SupportsXDRoPE,
 ):
+    merge_by_field_config = True
     multimodal_cpu_fields = {"image_grid_thw"}
 
     # To ensure correct weight loading and mapping.