From b87cb97a53bcff92a90308528b3f313e43aff102 Mon Sep 17 00:00:00 2001 From: myselvess <244285088@qq.com> Date: Tue, 19 Aug 2025 21:12:59 +0800 Subject: [PATCH] [Model] support new model ovis2.5 (#23084) Signed-off-by: myselvess <244285088@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 33 + .../vision_language_multi_image.py | 31 + .../multimodal/generation/test_common.py | 21 + .../generation/vlm_utils/model_utils.py | 58 ++ .../multimodal/processing/test_common.py | 2 + tests/models/registry.py | 3 + vllm/model_executor/models/ovis2_5.py | 570 ++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/siglip2navit.py | 607 ++++++++++++++++++ .../transformers_utils/processors/__init__.py | 3 +- vllm/transformers_utils/processors/ovis2_5.py | 458 +++++++++++++ 12 files changed, 1787 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/ovis2_5.py create mode 100644 vllm/model_executor/models/siglip2navit.py create mode 100644 vllm/transformers_utils/processors/ovis2_5.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index bfab5713c742..1d165fa6f16b 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -641,6 +641,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `MolmoForCausalLM` | Molmo | T + I+ | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ | | `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | | `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ | +| `Ovis2_5` | Ovis2.5 | T + I+ + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | ✅︎ | | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 9f6028d87cb2..88bbbfdfbd18 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1105,6 +1105,38 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData: ) +# Ovis2_5 +def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData: + model_name = "AIDC-AI/Ovis2.5-2B" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + trust_remote_code=True, + dtype="half", + limit_mm_per_prompt={modality: 1}, + ) + if modality == "image": + placeholder = "" + elif modality == "video": + placeholder = "