[Bugfix] Fix incorrect image_grid_thw rank for HunyuanOCR from missing merge_by_field_config=True (#29950)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-16 11:47:09 +08:00 · 2025-12-03 18:05:10 +08:00 · 2025-12-03 18:05:10 +08:00 · a21cd9ed23
commit a21cd9ed23
parent 7fe9c1a223
2 changed files with 24 additions and 0 deletions
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -309,6 +309,28 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+# HunyuanOCR: build a multi-image request with one placeholder group per image
+def load_hunyuan_vl(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "tencent/HunyuanOCR"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        limit_mm_per_prompt={"image": len(image_urls)},  # cap = number of URLs
+    )
+
+    placeholder = (  # three-token group, repeated once per image URL
+        "<｜hy_place▁holder▁no▁100｜><｜hy_place▁holder▁no▁102｜><｜hy_place▁holder▁no▁101｜>"  # noqa: E501
+    ) * len(image_urls)
+    prompt = f"<｜hy_begin▁of▁sentence｜>{placeholder}{question}<｜hy_User｜>"
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],  # same order as URLs
+    )
+
+
 def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
 ) -> ModelRequestData:
@@ -1322,6 +1344,7 @@ model_example_map = {
    "deepseek_ocr": load_deepseek_ocr,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
+    "hunyuan_vl": load_hunyuan_vl,
    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
    "idefics3": load_idefics3,
    "interns1": load_interns1,
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -785,6 +785,7 @@ class HunYuanVLForConditionalGeneration(
    SupportsQuant,
    SupportsXDRoPE,
 ):
+    merge_by_field_config = True
    multimodal_cpu_fields = {"image_grid_thw"}

    # To ensure correct weight loading and mapping.