From c9232d41f433abd1d6f0960bcec020660078d718 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 14 Aug 2025 01:03:05 +0800 Subject: [PATCH] [CI/Build] Update VLM common tests (#22841) Signed-off-by: DarkLight1337 --- .../multimodal/generation/test_common.py | 16 +--------------- vllm/model_executor/models/minicpmv.py | 19 +------------------ 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 2a65d7e244d7..2919bdbe91bb 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -561,7 +561,7 @@ VLM_TEST_SETTINGS = { get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, - # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 + # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49 marks=[pytest.mark.skip("HF import fails")], ), "minicpmv_26": VLMTestInfo( @@ -574,8 +574,6 @@ VLM_TEST_SETTINGS = { get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, - # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55 - marks=[pytest.mark.skip("HF import fails")], ), "minimax_vl_01": VLMTestInfo( models=["MiniMaxAI/MiniMax-VL-01"], @@ -611,18 +609,6 @@ VLM_TEST_SETTINGS = { patch_hf_runner=model_utils.ovis_patch_hf_runner, marks=[large_gpu_mark(min_gb=32)], ), - "ovis1_6": VLMTestInfo( - models=["AIDC-AI/Ovis1.6-Llama3.2-3B"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 - img_idx_to_prompt=lambda idx: "\n", # noqa: E501 - max_model_len=4096, - max_num_seqs=2, - dtype="half", - # use sdpa mode for hf runner since ovis2 didn't work with flash_attn - hf_model_kwargs={"llm_attn_implementation": "sdpa"}, - patch_hf_runner=model_utils.ovis_patch_hf_runner, - ), "ovis2": VLMTestInfo( models=["AIDC-AI/Ovis2-1B"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 88dd1a57626f..47ce771d8c90 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -85,30 +85,13 @@ class MiniCPMVImagePixelInputs(TensorSchema): - w: Width """ - def _validate_nested_tensors( - self, - value: Union[list[torch.Tensor], tuple[torch.Tensor, ...]], - field_name: str, - expected_shape: tuple[Union[int, str], ...], - dynamic_dims: set[str], - ) -> tuple[int, ...]: - # value[0] is the scaled image, - # and value[1:] is a collection of image slices. - # It is ensured that all slices in the collection - # have the same shape. - if field_name == "pixel_values": - value = value[1:] if len(value) > 1 else value - - return super()._validate_nested_tensors(value, field_name, - expected_shape, dynamic_dims) - type: Literal["pixel_values"] = "pixel_values" # Note that the image size may vary, so we pass it as a list instead of a # batched tensor. pixel_values: Annotated[ list[torch.Tensor], - TensorShape("bns", "c", "h", "w"), + TensorShape("bns", "c", "h", "w", dynamic_dims={"h", "w"}), ] tgt_sizes: Annotated[ torch.Tensor,