diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 4763f2281d32..74f3a9d1cdb5 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -627,7 +627,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
 | `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ |
 | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
@@ -701,7 +701,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th
 - There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups.
 
 !!! note
-    Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
+    For `InternVLChatModel`, video inputs are currently supported only for InternVL2.5 models with a Qwen2.5 text backbone (`OpenGVLab/InternVL2_5-1B`, etc.), InternVL3, and InternVL3.5.
 
 !!! note
     To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
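The docs hunk above adds InternVL3.5 to the supported-models table and extends the video-support note. As a usage illustration of what that note promises, here is a minimal offline-inference sketch using vLLM's multimodal `LLM.generate` API; the checkpoint choice, prompt template, frame count, and dummy frames are assumptions for illustration, not part of this patch:

```python
# Hedged sketch (not part of this patch): video inference with an InternVL3.5
# checkpoint through vLLM's multimodal generate API.
import numpy as np
from vllm import LLM, SamplingParams

llm = LLM(model="OpenGVLab/InternVL3_5-1B", trust_remote_code=True)

# Eight dummy RGB frames standing in for a decoded video clip.
frames = np.zeros((8, 448, 448, 3), dtype=np.uint8)

outputs = llm.generate(
    {
        "prompt": "<video>\nDescribe what happens in this clip.",
        "multi_modal_data": {"video": frames},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```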
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index a604d11f0e76..74ca10d32609 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -286,6 +286,9 @@ def _test_processing_correctness_one(
     "internlm/Intern-S1",
     "OpenGVLab/InternVL2-1B",
     "OpenGVLab/InternVL3-1B",
+    "OpenGVLab/InternVL3_5-1B",
+    "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
+    "OpenGVLab/InternVL3_5-30B-A3B",
     "Kwai-Keye/Keye-VL-8B-Preview",
     "moonshotai/Kimi-VL-A3B-Instruct",
     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 79164f02c339..2d8cd49edc73 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -38,7 +38,12 @@ ARCH_NEEDS_EXTRAS = [
     "MiniCPMV",
     "PaliGemmaForConditionalGeneration",
 ]
-REPO_ID_TO_SKIP = {"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test"}
+REPO_ID_TO_SKIP = {
+    "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",
+    # FIXME(Isotr0py): enable the GPT-OSS-based InternVL3.5 model
+    # once PP support for GPT-OSS lands
+    "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview": "Broken model",
+}
 
 ImageInput = list[Image.Image]
 VideoInput = Union[list[Image.Image], list[np.ndarray],
diff --git a/tests/models/registry.py b/tests/models/registry.py
index b34c6f2e5dc8..20c7c3af6776 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -422,7 +422,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                          trust_remote_code=True),  # noqa: E501
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
                                          extras={"2B": "OpenGVLab/InternVL2-2B",
-                                                 "3.0": "OpenGVLab/InternVL3-1B"},  # noqa: E501
+                                                 "3.0": "OpenGVLab/InternVL3-1B",  # noqa: E501
+                                                 "3.5-qwen3": "OpenGVLab/InternVL3_5-1B",  # noqa: E501
+                                                 "3.5-qwen3moe": "OpenGVLab/InternVL3_5-30B-A3B",  # noqa: E501
+                                                 "3.5-gptoss": "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"},  # noqa: E501
                                          trust_remote_code=True),
     "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview",  # noqa: E501
                                                     trust_remote_code=True),
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index da8ad8396725..b09ed7bbe72a 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -855,9 +855,13 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
 
     def get_video_token(self) -> Optional[str]:
         text_model_type = self.get_hf_config().get_text_config().model_type
-        if text_model_type == "qwen2":
-            return "<|video_pad|>"
-        return None
+        video_token_map = {
+            "qwen2": "<|video_pad|>",
+            "qwen3": "<|video_pad|>",
+            "qwen3_moe": "<|video_pad|>",
+            "gpt_oss": "<|reserved_200000|>",
+        }
+        return video_token_map.get(text_model_type)
 
     def get_num_frames_with_most_features(
         self,
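The functional core of the patch is the `get_video_token` change: the single `qwen2` check becomes a lookup table covering the InternVL3.5 text backbones, with unknown backbones falling through to `None`, which is how architectures without video support are signalled. Below is a self-contained sketch of that lookup; the map contents mirror the patch, while the standalone helper name is illustrative only:

```python
# Standalone illustration of the backbone -> video-token lookup added above.
from typing import Optional

# Mirrors the mapping introduced in InternVLProcessingInfo.get_video_token.
_VIDEO_TOKEN_MAP = {
    "qwen2": "<|video_pad|>",
    "qwen3": "<|video_pad|>",
    "qwen3_moe": "<|video_pad|>",
    "gpt_oss": "<|reserved_200000|>",
}

def video_token_for(text_model_type: str) -> Optional[str]:
    # dict.get returns None for backbones without a video placeholder token.
    return _VIDEO_TOKEN_MAP.get(text_model_type)

assert video_token_for("qwen3_moe") == "<|video_pad|>"
assert video_token_for("llama") is None  # no token -> no video support
```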