From c9232d41f433abd1d6f0960bcec020660078d718 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 14 Aug 2025 01:03:05 +0800
Subject: [PATCH] [CI/Build] Update VLM common tests (#22841)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../multimodal/generation/test_common.py      | 16 +---------------
 vllm/model_executor/models/minicpmv.py        | 19 +------------------
 2 files changed, 2 insertions(+), 33 deletions(-)
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index 2a65d7e244d7..2919bdbe91bb 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -561,7 +561,7 @@ VLM_TEST_SETTINGS = {
         get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
-        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
+        # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
         marks=[pytest.mark.skip("HF import fails")],
     ),
     "minicpmv_26": VLMTestInfo(
@@ -574,8 +574,6 @@ VLM_TEST_SETTINGS = {
         get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']),  # noqa: E501
         hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
         patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
-        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
-        marks=[pytest.mark.skip("HF import fails")],
     ),
     "minimax_vl_01": VLMTestInfo(
         models=["MiniMaxAI/MiniMax-VL-01"],
@@ -611,18 +609,6 @@ VLM_TEST_SETTINGS = {
         patch_hf_runner=model_utils.ovis_patch_hf_runner,
         marks=[large_gpu_mark(min_gb=32)],
     ),
-    "ovis1_6": VLMTestInfo(
-        models=["AIDC-AI/Ovis1.6-Llama3.2-3B"],
-        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
-        img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
-        max_model_len=4096,
-        max_num_seqs=2,
-        dtype="half",
-        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
-        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
-        patch_hf_runner=model_utils.ovis_patch_hf_runner,
-    ),
     "ovis2": VLMTestInfo(
         models=["AIDC-AI/Ovis2-1B"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 88dd1a57626f..47ce771d8c90 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -85,30 +85,13 @@ class MiniCPMVImagePixelInputs(TensorSchema):
         - w: Width
     """
 
-    def _validate_nested_tensors(
-        self,
-        value: Union[list[torch.Tensor], tuple[torch.Tensor, ...]],
-        field_name: str,
-        expected_shape: tuple[Union[int, str], ...],
-        dynamic_dims: set[str],
-    ) -> tuple[int, ...]:
-        # value[0] is the scaled image,
-        # and value[1:] is a collection of image slices.
-        # It is ensured that all slices in the collection
-        # have the same shape.
-        if field_name == "pixel_values":
-            value = value[1:] if len(value) > 1 else value
-
-        return super()._validate_nested_tensors(value, field_name,
-                                                expected_shape, dynamic_dims)
-
     type: Literal["pixel_values"] = "pixel_values"
 
     # Note that the image size may vary, so we pass it as a list instead of a
     # batched tensor.
     pixel_values: Annotated[
         list[torch.Tensor],
-        TensorShape("bns", "c", "h", "w"),
+        TensorShape("bns", "c", "h", "w", dynamic_dims={"h", "w"}),
     ]
     tgt_sizes: Annotated[
         torch.Tensor,