[MODEL] New model support for naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B (#20931)

Signed-off-by: bigshanedogg <bigshane319@gmail.com>
2026-03-21 14:13:34 +08:00 · 2025-07-25 22:05:42 +09:00 · 2025-07-25 22:05:42 +09:00 · 29c6fbe58c
commit 29c6fbe58c
parent c72f049cb4
7 changed files with 1365 additions and 0 deletions
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -365,6 +365,7 @@ th {
 | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
 | `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
 | `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
+| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
 | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    )


+# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
+def run_hyperclovax_seed_vision(
+    questions: list[str], modality: str
+) -> ModelRequestData:
+    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192 if modality == "image" else 16384,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    messages = list()
+    for question in questions:
+        if modality == "image":
+            """
+            ocr: List the words in the image in raster order. 
+                Even if the word order feels unnatural for reading, 
+                the model will handle it as long as it follows raster order.
+                e.g. "Naver, CLOVA, bigshane"
+            lens_keywords: List the entity names in the image.
+                e.g. "iPhone"
+            lens_local_keywords: List the entity names with quads in the image.
+                e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
+            """
+            messages.append(
+                [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image",
+                                "ocr": "",
+                                "lens_keywords": "",
+                                "lens_local_keywords": "",
+                            },
+                            {
+                                "type": "text",
+                                "text": question,
+                            },
+                        ],
+                    }
+                ]
+            )
+        elif modality == "video":
+            messages.append(
+                [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "video",
+                            },
+                            {
+                                "type": "text",
+                                "text": question,
+                            },
+                        ],
+                    }
+                ]
+            )
+        else:
+            raise ValueError(f"Unsupported modality: {modality}")
+
+    prompts = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=None,
+    )
+
+
 # Idefics3-8B-Llama3
 def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@ -1222,6 +1301,7 @@ model_example_map = {
    "glm4v": run_glm4v,
    "glm4_1v": run_glm4_1v,
    "h2ovl_chat": run_h2ovl,
+    "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
    "nemotron_vl": run_nemotron_vl,
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@ -289,6 +289,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_hyperclovax_seed_vision(
+    question: str, image_urls: list[str]
+) -> ModelRequestData:
+    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=16384,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    message = {"role": "user", "content": list()}
+    for _image_url in image_urls:
+        message["content"].append(
+            {
+                "type": "image",
+                "image": _image_url,
+                "ocr": "",
+                "lens_keywords": "",
+                "lens_local_keywords": "",
+            }
+        )
+    message["content"].append(
+        {
+            "type": "text",
+            "text": question,
+        }
+    )
+
+    prompt = tokenizer.apply_chat_template(
+        [
+            message,
+        ],
+        tokenize=False,
+        add_generation_prompt=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        stop_token_ids=None,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
    # it will generate poor response for multi-image inputs!
@ -900,6 +947,7 @@ model_example_map = {
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
+    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
    "keye_vl": load_keye_vl,
    "kimi_vl": load_kimi_vl,
    "llava": load_llava,
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@ -278,6 +278,7 @@ def _test_processing_correctness_one(
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    "moonshotai/Kimi-VL-A3B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -201,6 +201,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                               trust_remote_code=True),
    "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
                                               trust_remote_code=True),
+    "HCXVisionForCausalLM": _HfExamplesInfo(
+        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
+        trust_remote_code=True),
    "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
                                           trust_remote_code=True),
    "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@ -81,6 +81,7 @@ _TEXT_GENERATION_MODELS = {
    "Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
    "HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
    "HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"),
+    "HCXVisionForCausalLM": ("hyperclovax_vision", "HCXVisionForCausalLM"),
    "InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
    "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
    "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),