diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 4763f2281d32..74f3a9d1cdb5 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -627,7 +627,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ |
| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
@@ -701,7 +701,7 @@ Some models are supported only via the [Transformers backend](#transformers). Th
- There's no PLE caching or out-of-memory swapping support, as described in [Google's blog](https://developers.googleblog.com/en/introducing-gemma-3n/). These features might be too model-specific for vLLM, and swapping in particular may be better suited for constrained setups.
!!! note
- Only `InternVLChatModel` with Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B` etc) has video inputs support currently.
+ For `InternVLChatModel`, video inputs are currently supported only for InternVL2.5 models with a Qwen2.5 text backbone (`OpenGVLab/InternVL2.5-1B`, etc.), InternVL3, and InternVL3.5.
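+
+ Below is a minimal offline-inference sketch for video input. It is illustrative only, not the project's official snippet: the dummy 8-frame clip, prompt wording, and checkpoint choice are assumptions.
+
+ ```python
+ import numpy as np
+ from vllm import LLM, SamplingParams
+
+ # Stand-in clip: 8 blank RGB frames. Decode a real video file in practice.
+ video = np.zeros((8, 224, 224, 3), dtype=np.uint8)
+
+ # InternVL checkpoints require trust_remote_code for their HF processors.
+ llm = LLM(model="OpenGVLab/InternVL3_5-1B", trust_remote_code=True)
+
+ outputs = llm.generate(
+     {
+         "prompt": "<video>\nDescribe the video.",
+         "multi_modal_data": {"video": video},
+     },
+     SamplingParams(max_tokens=64),
+ )
+ print(outputs[0].outputs[0].text)
+ ```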
!!! note
To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index a604d11f0e76..74ca10d32609 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -286,6 +286,9 @@ def _test_processing_correctness_one(
"internlm/Intern-S1",
"OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL3-1B",
+ "OpenGVLab/InternVL3_5-1B",
+ "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
+ "OpenGVLab/InternVL3_5-30B-A3B",
"Kwai-Keye/Keye-VL-8B-Preview",
"moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py
index 79164f02c339..2d8cd49edc73 100644
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -38,7 +38,12 @@ ARCH_NEEDS_EXTRAS = [
"MiniCPMV",
"PaliGemmaForConditionalGeneration",
]
-REPO_ID_TO_SKIP = {"nm-testing/pixtral-12b-FP8-dynamic": "duplicated test"}
+REPO_ID_TO_SKIP = {
+ "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",
+    # FIXME(Isotr0py): enable the GPT-OSS based InternVL3.5 model
+    # once PP (pipeline parallelism) support for GPT-OSS lands
+ "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview": "Broken model",
+}
ImageInput = list[Image.Image]
VideoInput = Union[list[Image.Image], list[np.ndarray],
diff --git a/tests/models/registry.py b/tests/models/registry.py
index b34c6f2e5dc8..20c7c3af6776 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -422,7 +422,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True), # noqa: E501
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
extras={"2B": "OpenGVLab/InternVL2-2B",
- "3.0": "OpenGVLab/InternVL3-1B"}, # noqa: E501
+ "3.0": "OpenGVLab/InternVL3-1B", # noqa: E501
+ "3.5-qwen3": "OpenGVLab/InternVL3_5-1B", # noqa: E501
+ "3.5-qwen3moe": "OpenGVLab/InternVL3_5-30B-A3B", # noqa: E501
+ "3.5-gptoss": "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview"}, # noqa: E501
trust_remote_code=True),
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
trust_remote_code=True),
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index da8ad8396725..b09ed7bbe72a 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -855,9 +855,13 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
def get_video_token(self) -> Optional[str]:
text_model_type = self.get_hf_config().get_text_config().model_type
- if text_model_type == "qwen2":
- return "<|video_pad|>"
- return None
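+        # The video placeholder token depends on the text backbone:
+        # Qwen2/Qwen3/Qwen3-MoE backbones share `<|video_pad|>`, while the
+        # GPT-OSS backbone uses one of its reserved special tokens.
+        # Backbones without an entry return None (no video input support).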
+ video_token_map = {
+ "qwen2": "<|video_pad|>",
+ "qwen3": "<|video_pad|>",
+ "qwen3_moe": "<|video_pad|>",
+ "gpt_oss": "<|reserved_200000|>",
+ }
+ return video_token_map.get(text_model_type)
def get_num_frames_with_most_features(
self,