From 32d2b4064feea38802489b71e47703d1f901a17e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 23 Aug 2025 01:46:34 +0800 Subject: [PATCH] [Model] Add Ovis2.5 PP support (#23405) Signed-off-by: Isotr0py --- tests/distributed/test_pipeline_parallel.py | 1 + .../multimodal/generation/test_common.py | 6 +- tests/models/registry.py | 4 +- vllm/model_executor/models/ovis2_5.py | 36 +-- vllm/model_executor/models/siglip2navit.py | 243 ++++++++++++------ 5 files changed, 185 insertions(+), 105 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 12dd7c422263..28150d768237 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -233,6 +233,7 @@ MULTIMODAL_MODELS = { "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(), "allenai/Molmo-7B-D-0924": PPTestSettings.fast(), "AIDC-AI/Ovis2-1B": PPTestSettings.fast(), + "AIDC-AI/Ovis2.5-2B": PPTestSettings.fast(), "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(), "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"), "Qwen/Qwen-VL-Chat": PPTestSettings.fast(), diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index ea5de9d9f5c5..96208f8eda62 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -11,7 +11,6 @@ from pathlib import PosixPath import pytest from transformers import (AutoModel, AutoModelForImageTextToText, AutoModelForTextToWaveform, AutoModelForVision2Seq) -from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform from vllm.utils import identity @@ -637,10 +636,7 @@ VLM_TEST_SETTINGS = { dtype="half", num_logprobs=10, patch_hf_runner=model_utils.ovis2_5_patch_hf_runner, - marks=[pytest.mark.skipif( - not is_flash_attn_2_available(), - reason="HF model needs `flash_attn` installed" - )], + hf_model_kwargs={"revision": "refs/pr/5"}, ), "phi3v": VLMTestInfo( models=["microsoft/Phi-3.5-vision-instruct"], diff --git a/tests/models/registry.py b/tests/models/registry.py index 4035319b45ce..25dbbd7fa983 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -468,9 +468,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B", "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}), # noqa: E501 "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", - trust_remote_code=True, - max_transformers_version="4.53", - transformers_version_reason="HF model is not compatible"), # noqa: E501 + trust_remote_code=True), "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index aa4ea3dd48f6..58a14072443c 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -30,7 +30,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor -from .interfaces import MultiModalEmbeddings, SupportsMultiModal +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP IMAGE_TOKEN = "" VIDEO_TOKEN = "