diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index deaeea059cca..0eaf7198f91b 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -137,7 +137,7 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "qwen2_5_omni": VLMTestInfo(
@@ -152,7 +152,7 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForTextToWaveform,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
     "qwen3_vl": VLMTestInfo(
@@ -173,7 +173,7 @@ VLM_TEST_SETTINGS = {
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[
             pytest.mark.core_model,
         ],
@@ -350,7 +350,7 @@ VLM_TEST_SETTINGS = {
         patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
         hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
         stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],
-        image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
+        image_size_factors=[(1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
     ),
     "fuyu": VLMTestInfo(
         models=["adept/fuyu-8b"],
@@ -707,7 +707,7 @@ VLM_TEST_SETTINGS = {
         max_model_len=8192,
         max_num_seqs=2,
         auto_cls=AutoModelForCausalLM,
-        image_size_factors=[(), (0.25,)],
+        image_size_factors=[(0.25,)],
         marks=[
             pytest.mark.skipif(
                 Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
@@ -760,7 +760,7 @@ VLM_TEST_SETTINGS = {
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.cpu_model],
     ),
     "skywork_r1v": VLMTestInfo(
@@ -812,7 +812,7 @@ VLM_TEST_SETTINGS = {
         max_model_len=4096,
         max_num_seqs=2,
         auto_cls=AutoModelForImageTextToText,
-        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.skip("Model initialization hangs")],
     ),
     ### Tensor parallel / multi-gpu broadcast tests
diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
index d42150bcbf67..116eead7a70a 100644
--- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
@@ -62,6 +62,65 @@ def get_filtered_test_settings(
     return matching_tests


+def get_model_type_cases(
+    model_type: str,
+    test_info: VLMTestInfo,
+    test_type: VLMTestType,
+):
+    # Ensure that something is wrapped as an iterable if it's not already
+    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
+
+    # This is essentially the same as nesting a bunch of mark.parametrize
+    # decorators, but we do it programmatically to allow overrides on a
+    # per-model basis, while still being able to execute each of these
+    # as individual test cases in pytest.
+    iter_kwargs = OrderedDict(
+        [
+            ("model", ensure_wrapped(test_info.models)),
+            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
+            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
+            ("dtype", ensure_wrapped(test_info.dtype)),
+            (
+                "distributed_executor_backend",
+                ensure_wrapped(test_info.distributed_executor_backend),
+            ),
+        ]
+    )
+
+    # num_frames is video only
+    if test_type == VLMTestType.VIDEO:
+        iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
+        iter_kwargs["needs_video_metadata"] = ensure_wrapped(
+            test_info.needs_video_metadata
+        )
+
+    # No sizes passed for custom inputs, since inputs are directly provided
+    if test_type not in (
+        VLMTestType.CUSTOM_INPUTS,
+        VLMTestType.AUDIO,
+    ):
+        wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
+        if wrapped_sizes is None:
+            raise ValueError(f"Sizes must be set for test type {test_type}")
+        iter_kwargs["size_wrapper"] = wrapped_sizes
+
+    # Otherwise expand the custom test options instead
+    elif test_type == VLMTestType.CUSTOM_INPUTS:
+        if test_info.custom_test_opts is None:
+            raise ValueError("Test has type CUSTOM_INPUTS, but none given")
+        iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
+
+    # Wrap all model cases in a pytest parameter & pass marks through
+    return [
+        pytest.param(
+            model_type,
+            ExpandableVLMTestArgs(**{k: v for k, v in zip(iter_kwargs.keys(), case)}),
+            marks=test_info.marks if test_info.marks is not None else [],
+        )
+        for case in list(itertools.product(*iter_kwargs.values()))
+    ]
+
+
 def get_parametrized_options(
     test_settings: dict[str, VLMTestInfo],
     test_type: VLMTestType,
@@ -76,64 +135,11 @@ def get_parametrized_options(
         test_settings, test_type, create_new_process_for_each_test
     )

-    # Ensure that something is wrapped as an iterable it's not already
-    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
-
-    def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
-        # This is essentially the same as nesting a bunch of mark.parametrize
-        # decorators, but we do it programmatically to allow overrides for on
-        # a per-model basis, while still being able to execute each of these
-        # as individual test cases in pytest.
-        iter_kwargs = OrderedDict(
-            [
-                ("model", ensure_wrapped(test_info.models)),
-                ("max_tokens", ensure_wrapped(test_info.max_tokens)),
-                ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
-                ("dtype", ensure_wrapped(test_info.dtype)),
-                (
-                    "distributed_executor_backend",
-                    ensure_wrapped(test_info.distributed_executor_backend),
-                ),
-            ]
-        )
-
-        # num_frames is video only
-        if test_type == VLMTestType.VIDEO:
-            iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
-            iter_kwargs["needs_video_metadata"] = ensure_wrapped(
-                test_info.needs_video_metadata
-            )
-
-        # No sizes passed for custom inputs, since inputs are directly provided
-        if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
-            wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
-            if wrapped_sizes is None:
-                raise ValueError(f"Sizes must be set for test type {test_type}")
-            iter_kwargs["size_wrapper"] = wrapped_sizes
-
-        # Otherwise expand the custom test options instead
-        elif test_type == VLMTestType.CUSTOM_INPUTS:
-            if test_info.custom_test_opts is None:
-                raise ValueError("Test has type CUSTOM_INPUTS, but none given")
-            iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
-
-        # Wrap all model cases in a pytest parameter & pass marks through
-        return [
-            pytest.param(
-                model_type,
-                ExpandableVLMTestArgs(
-                    **{k: v for k, v in zip(iter_kwargs.keys(), case)}
-                ),
-                marks=test_info.marks if test_info.marks is not None else [],
-            )
-            for case in list(itertools.product(*iter_kwargs.values()))
-        ]
-
     # Get a list per model type, where each entry contains a tuple of all of
     # that model type's cases, then flatten them into the top level so that
     # we can consume them in one mark.parametrize call.
     cases_by_model_type = [
-        get_model_type_cases(model_type, test_info)
+        get_model_type_cases(model_type, test_info, test_type)
         for model_type, test_info in matching_tests.items()
     ]
     return list(itertools.chain(*cases_by_model_type))
diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py
index 0c03c8449712..ae2f75481359 100644
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@@ -50,8 +50,8 @@ MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PL

 VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"

-IMAGE_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
-EMBEDDING_SIZE_FACTORS = [(), (1.0,), (1.0, 1.0, 1.0)]
+IMAGE_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
+EMBEDDING_SIZE_FACTORS = [(1.0,), (1.0, 1.0, 1.0)]


 RunnerOutput = tuple[list[int], str, SampleLogprobs | None]
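For context on the helper being hoisted to module scope: it expands one `VLMTestInfo` into a list of `pytest.param` cases by taking the Cartesian product of its scalar-or-tuple fields, equivalent to stacking `@pytest.mark.parametrize` decorators. A minimal standalone sketch of that expansion pattern; `FakeTestInfo`, `expand_cases`, and their fields are hypothetical stand-ins, not the real `VLMTestInfo` or `ExpandableVLMTestArgs`:

```python
import itertools
from collections import OrderedDict
from dataclasses import dataclass

import pytest


@dataclass
class FakeTestInfo:
    """Illustrative stand-in for VLMTestInfo; fields may be scalars or tuples."""

    models: tuple | str
    max_tokens: tuple | int
    dtype: tuple | str


def expand_cases(model_type: str, info: FakeTestInfo) -> list:
    # Wrap scalar fields so everything is iterable (mirrors ensure_wrapped).
    wrap = lambda e: e if isinstance(e, (list, tuple)) else (e,)
    iter_kwargs = OrderedDict(
        [
            ("model", wrap(info.models)),
            ("max_tokens", wrap(info.max_tokens)),
            ("dtype", wrap(info.dtype)),
        ]
    )
    # One pytest.param per element of the Cartesian product, analogous to
    # nesting one @pytest.mark.parametrize decorator per key.
    return [
        pytest.param(model_type, dict(zip(iter_kwargs.keys(), case)))
        for case in itertools.product(*iter_kwargs.values())
    ]


# 2 models x 2 max_tokens x 1 dtype -> 4 individually reported pytest cases
cases = expand_cases("llava", FakeTestInfo(("m1", "m2"), (128, 256), "half"))
assert len(cases) == 4
```

Moving the real function out of `get_parametrized_options` follows the same shape: `test_type` becomes an explicit parameter instead of a closure variable, which is why the call site in `cases_by_model_type` now passes three arguments.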