diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 6f23a29e72f71..0edcd0407747c 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -449,25 +449,6 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: ) -# omni-research/Tarsier-7b -def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - model_name = "omni-research/Tarsier-7b" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=4096, - limit_mm_per_prompt={modality: 1}, - ) - prompts = [(f"USER: \n{question} ASSISTANT:") for question in questions] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Intern-S1 def run_interns1(questions: list[str], modality: str) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -1293,6 +1274,25 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# omni-research/Tarsier-7b +def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "omni-research/Tarsier-7b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [(f"USER: \n{question} ASSISTANT:") for question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: model_name = "omni-research/Tarsier2-Recap-7b" diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 604bb307b889d..76f3bc0ebf89f 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -4,8 +4,6 @@ from dataclasses import dataclass from typing import Optional import pytest -from packaging.version import Version -from transformers import __version__ as TRANSFORMERS_VERSION import vllm from vllm.assets.image import ImageAsset @@ -185,10 +183,6 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): current_platform.is_rocm(), reason="Qwen2.5-VL dependency xformers incompatible with ROCm", ) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) < Version("4.49.0"), - reason="Qwen2.5-VL require transformers version no lower than 4.49.0", -) def test_qwen25vl_lora(qwen25vl_lora_files): """Test Qwen 2.5 VL model with LoRA""" config = TestConfig(model_path=QWEN25VL_MODEL_PATH, diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 5bff615fb1071..967228b54a0af 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -702,13 +702,38 @@ VLM_TEST_SETTINGS = { "smolvlm": VLMTestInfo( models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}\nAssistant:", # noqa: E501 + prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}\nAssistant:", # noqa: E501 img_idx_to_prompt=lambda idx: "", max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, hf_output_post_proc=model_utils.smolvlm_trunc_hf_output, ), + "tarsier": VLMTestInfo( + models=["omni-research/Tarsier-7b"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:", + max_model_len=4096, + max_num_seqs=2, + 
auto_cls=AutoModelForImageTextToText, + patch_hf_runner=model_utils.tarsier_patch_hf_runner, + ), + "tarsier2": VLMTestInfo( + models=["omni-research/Tarsier2-Recap-7b"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO, + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForImageTextToText, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[pytest.mark.skip("Model initialization hangs")], + ), ### Tensor parallel / multi-gpu broadcast tests "chameleon-broadcast": VLMTestInfo( models=["facebook/chameleon-7b"], diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index c1a2aa0dcafbb..5e8dac6bce96a 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -818,3 +818,15 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner: thinker.get_output_embeddings = lambda: thinker.lm_head hf_model.model = thinker return hf_model + + +def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + from vllm.model_executor.models.tarsier import get_vision_encoder_info + + vision_encoder_info = get_vision_encoder_info(hf_model.config) + + hf_processor = hf_model.processor + if hf_processor.patch_size is None: + hf_processor.patch_size = vision_encoder_info.get_patch_size() + + return hf_model diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py index c7d1b5271ff72..54a0be99384a8 100644 --- a/tests/models/multimodal/processing/test_transformers.py +++ b/tests/models/multimodal/processing/test_transformers.py @@ -16,7 +16,7 @@ def test_multimodal_processor(model_id): model_impl="transformers", ) - mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, ) + mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config) image_pil = ImageAsset('cherry_blossom').pil_image mm_data = {"image": image_pil} diff --git a/tests/models/registry.py b/tests/models/registry.py index b9e7de4e9fd11..806342a57dfab 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -465,8 +465,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { is_available_online=False), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), - "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501 - hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501 + "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501 "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501 hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501 "VoxtralForConditionalGeneration": _HfExamplesInfo( diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 8a3f09bdbe27e..659ee9af9ddec 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -2,16 +2,15 @@ # SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project from contextlib import nullcontext -from types import MethodType -from typing import cast +from typing import Optional, cast from unittest.mock import MagicMock import numpy as np import pytest import torch -from transformers import ProcessorMixin from vllm.config import ModelConfig +from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, MultiModalKwargsItem, @@ -1013,57 +1012,91 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ) -class _ProcessorProxy: +class DummyProcessor: - def __init__(self, processor: ProcessorMixin) -> None: + def __init__(self, a: int = 0, b: int = 0) -> None: super().__init__() - self.__processor = processor - - def __getattr__(self, key: str): - return getattr(self.__processor, key) + self.a = a + self.b = b def __call__( self, - text=None, - images=None, - videos=None, - exists=None, - return_tensors=None, - ): - return dict(exists=exists) + a: int = 0, + c: int = 0, + return_tensors: Optional[str] = None, + ) -> dict[str, int]: + return dict(a=a, c=c) -@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy # yapf: disable +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy @pytest.mark.parametrize( - ("call_kwargs", "expected_kwargs"), + ("config_kwargs", "inference_kwargs", "expected_kwargs"), [ - # Should ignore invalid kwargs - ({"does_not_exist": 100}, {"exists": None}), - ({"exists": 1}, {"exists": 1}), - ({"does_not_exist": 100, "exists": 1}, {"exists": 1}), + ({"a": 1}, {}, {"a": 1, "b": 0}), + ({}, {"a": 1}, {"a": 1, "b": 0}), + # inference_kwargs should take precedence + ({"a": 1}, {"a": 2}, {"a": 2, "b": 0}), + # Should ignore extra kwargs + ({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}), + ({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}), ], ) # yapf: enable -def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): - model_config = ModelConfig(model_id) +def test_hf_processor_init_kwargs( + model_id, + config_kwargs, + inference_kwargs, + expected_kwargs, +): + # Should not be used since there is nothing to convert to tokens + mock_tokenizer = cast(AnyTokenizer, object()) - processor = MULTIMODAL_REGISTRY.create_processor(model_config) - orig_get_hf_processor = processor.info.get_hf_processor - - def get_hf_processor(self, **kwargs): - assert kwargs == call_kwargs - return _ProcessorProxy(orig_get_hf_processor()) - - processor.info.get_hf_processor = MethodType(get_hf_processor, - processor.info) - - out_kwargs = processor._call_hf_processor( - prompt="", - mm_data={}, - mm_kwargs=call_kwargs, - tok_kwargs={}, + ctx = InputProcessingContext( + model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), + tokenizer=mock_tokenizer, ) - assert out_kwargs == expected_kwargs + processor = ctx.get_hf_processor( + DummyProcessor, # type: ignore[arg-type] + **inference_kwargs, + ) + + for k, v in expected_kwargs.items(): + assert getattr(processor, k) == v + + +# yapf: disable +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy +@pytest.mark.parametrize( + ("config_kwargs", "inference_kwargs", "expected_kwargs"), + [ + ({"a": 1}, {}, {"a": 1, "c": 0}), + ({}, {"a": 1}, {"a": 1, "c": 0}), + # inference_kwargs should take precedence + ({"a": 1}, {"a": 2}, {"a": 2, "c": 0}), + # Should ignore extra kwargs + ({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}), + ({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}), + ], 
+) +# yapf: enable +def test_hf_processor_call_kwargs( + model_id, + config_kwargs, + inference_kwargs, + expected_kwargs, +): + # Should not be used since there is nothing to convert to tokens + mock_tokenizer = cast(AnyTokenizer, object()) + + ctx = InputProcessingContext( + model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), + tokenizer=mock_tokenizer, + ) + + processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type] + + result = ctx.call_hf_processor(processor, {}, inference_kwargs) + assert result == expected_kwargs diff --git a/vllm/config.py b/vllm/config.py index edad5dd0406bf..9d5739ca11efd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -11,6 +11,7 @@ import textwrap import uuid import warnings from collections import Counter +from collections.abc import Mapping from contextlib import contextmanager from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass, replace) @@ -3332,7 +3333,16 @@ class MultiModalConfig: 999 if envs.VLLM_USE_V1 else 1, ) - # TODO: Add configs to init vision tower or not. + def merge_mm_processor_kwargs( + self, + inference_kwargs: Mapping[str, object], + ) -> dict[str, object]: + """ + Get the keyword arguments to pass to the multi-modal processor + according to the extra arguments passed during inference. + """ + kwargs = self.mm_processor_kwargs or {} + return kwargs | dict(inference_kwargs) @config diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 652136fbbfe73..6331a70b469aa 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -11,7 +11,7 @@ from typing_extensions import TypeVar from vllm.jsontree import JSONTree, json_map_leaves from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.utils import resolve_mm_processor_kwargs +from vllm.utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: from vllm.config import ModelConfig @@ -154,14 +154,11 @@ class InputProcessingContext(InputContext): assert callable(hf_processor) mm_config = self.model_config.get_multimodal_config() - base_kwargs = mm_config.mm_processor_kwargs - if base_kwargs is None: - base_kwargs = {} + merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs) - merged_kwargs = resolve_mm_processor_kwargs( - base_kwargs, - kwargs, + allowed_kwargs = get_allowed_kwarg_only_overrides( hf_processor, + merged_kwargs, requires_kw_only=False, allow_var_kwargs=True, ) @@ -173,7 +170,9 @@ class InputProcessingContext(InputContext): return x try: - output = hf_processor(**data, **merged_kwargs, return_tensors="pt") + output = hf_processor(**data, + **allowed_kwargs, + return_tensors="pt") # this emulates output.to(dtype=self.model_config.dtype) if isinstance(output, BatchFeature): cast_output = json_map_leaves(maybe_cast_dtype, output.data) @@ -189,7 +188,7 @@ class InputProcessingContext(InputContext): except Exception as exc: msg = (f"Failed to apply {type(hf_processor).__name__} " - f"on data={data} with kwargs={merged_kwargs}") + f"on data={data} with kwargs={allowed_kwargs}") raise ValueError(msg) from exc diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index a3eee9f065aea..b476a4f918bc3 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -123,16 +123,10 @@ class AyaVisionProcessingInfo(BaseProcessingInfo): return self.ctx.get_hf_config(AyaVisionConfig) def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor: - 
processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) + return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) - # Temporary workaround since this processor has multiple image tokens - # See https://github.com/huggingface/transformers/issues/38350 - processor._check_special_mm_tokens = lambda *args, **kwargs: None - - return processor - - def get_image_processor(self) -> GotOcr2ImageProcessor: - return self.get_hf_processor().image_processor + def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor: + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 544de5fe02d35..531018625478b 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -214,25 +214,25 @@ class DeepseekVL2MultiModalProcessor( mm_kwargs: Mapping[str, object], tok_kwargs: Mapping[str, object], ) -> BatchFeature: - if mm_data: - processed_outputs = self.info.ctx.call_hf_processor( - self.info.get_hf_processor(**mm_kwargs), - dict(prompt=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) - pixel_values = processed_outputs["pixel_values"] - # split pixel values into patches corresponding to each image - images_spatial_crop = processed_outputs["images_spatial_crop"] - patches_per_image = [ - x.prod().item() + 1 for x in images_spatial_crop - ] - pixel_values = pixel_values.split(patches_per_image) - processed_outputs["pixel_values"] = pixel_values - else: + if not mm_data: tokenizer = self.info.get_tokenizer() - processed_outputs = tokenizer(prompt, - add_special_tokens=True, - return_tensors="pt") + return tokenizer(prompt, + add_special_tokens=True, + return_tensors="pt") + + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + tok_kwargs=tok_kwargs, + ) + + pixel_values = processed_outputs["pixel_values"] + # split pixel values into patches corresponding to each image + images_spatial_crop = processed_outputs["images_spatial_crop"] + patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop] + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values return processed_outputs diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 399c739f408ee..56e456c2f1f2a 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -761,12 +761,6 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only): class Florence2ProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config() - - def get_hf_processor(self): - return self.ctx.get_hf_processor() - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7e1d478562a4c..b61e0361fe8c3 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -83,8 +83,8 @@ class FuyuProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object): return self.ctx.get_hf_processor(FuyuProcessor, **kwargs) - def get_image_processor(self) -> FuyuImageProcessor: - return self.get_hf_processor().image_processor + def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor: + return 
self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index ae1bf22c704e5..5f306f05d140e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -809,11 +809,11 @@ class Glm4vProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": 1} - def get_image_processor(self) -> Glm4vImageProcessor: - return self.get_hf_processor().image_processor + def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor: + return self.get_hf_processor(**kwargs).image_processor - def get_video_processor(self) -> Glm4vVideoProcessor: - return self.get_hf_processor().video_processor + def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor: + return self.get_hf_processor(**kwargs).video_processor def _get_vision_info( self, diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 467b074f37753..c3e4f81597adb 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -392,21 +392,7 @@ class H2OVLProcessor(BaseInternVLProcessor): class H2OVLProcessingInfo(BaseInternVLProcessingInfo): - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> H2OVLProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - + def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor: return self.ctx.init_processor( H2OVLProcessor, config=self.get_hf_config(), diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 3e8e50b35c0b7..e5c94c7f3a706 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -25,8 +25,7 @@ import torch import torch.nn as nn from timm.layers import LayerNorm, LayerNorm2d from timm.models.regnet import RegStage -from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig, - SiglipVisionConfig) +from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig from transformers.modeling_utils import no_init_weights from vllm.config import VllmConfig @@ -80,26 +79,9 @@ HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs] class HCXVisionProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config() - def get_vision_encoder_info(self): return get_vision_encoder_info(self.get_hf_config()) - def get_hf_processor( - self, - **kwargs: object, - ): - processor_cls = type( - AutoProcessor.from_pretrained( - self.ctx.model_config.model, - trust_remote_code=self.ctx.model_config.trust_remote_code, - )) - return self.ctx.get_hf_processor( - processor_cls, - **kwargs, - ) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 6e991d99b9638..3c01789b90066 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -88,15 
+88,7 @@ ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] class Idefics3ProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> Idefics3Processor: - if size is not None: - kwargs["size"] = size - + def get_hf_processor(self, **kwargs: object) -> Idefics3Processor: return self.ctx.get_hf_processor(Idefics3Processor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index a0e98ca3f8155..8e766dd4c4768 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -665,14 +665,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> BaseInternVLProcessor: + def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: @@ -882,27 +875,12 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): return max(max_frames_per_video, 1) - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> InternVLProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - - kwargs["video_token"] = self.get_video_token() - + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: return self.ctx.init_processor( InternVLProcessor, config=self.get_hf_config(), tokenizer=self.get_tokenizer(), + video_token=self.get_video_token(), **kwargs, ) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 892d970aaade0..4d8aa8de0f0b1 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -44,8 +44,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from vllm.transformers_utils.processor import ( - cached_image_processor_from_config) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -980,72 +978,8 @@ class KeyeMultiModalDataParser(MultiModalDataParser): class KeyeProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - return self.ctx.get_hf_processor( - image_processor=self.get_image_processor( - min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - ), - **kwargs, - ) - - def _get_image_processor_kwargs( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - if self.ctx.model_config.mm_processor_kwargs: - kwargs.update(self.ctx.model_config.mm_processor_kwargs) - - if min_pixels is 
not None: - kwargs["min_pixels"] = min_pixels - - if size is None: - size = {"shortest_edge": min_pixels} - else: - size["shortest_edge"] = min_pixels - - if max_pixels is not None: - kwargs["max_pixels"] = max_pixels - - if size is None: - size = {"longest_edge": max_pixels} - else: - size["longest_edge"] = max_pixels - - if size is not None: - kwargs["size"] = size - - return kwargs - - def get_image_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - return cached_image_processor_from_config( - self.ctx.model_config, - **self._get_image_processor_kwargs( - min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - **kwargs, - ), - ) + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} @@ -1246,20 +1180,6 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: return KeyeMultiModalDataParser() - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs) - return self.info.ctx.call_hf_processor( - self.info.get_hf_processor(**mm_kwargs), - dict(text=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) - def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 0126ace09e707..c863ba406422d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -8,11 +8,9 @@ from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, import torch import torch.nn as nn -from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) -from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -307,29 +305,14 @@ class PixtralHFMultiModalProcessor( pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: - # Before/after https://github.com/huggingface/transformers/pull/35122 - if Version(TRANSFORMERS_VERSION) <= Version("4.48.3"): - images = mm_data["images"] - assert isinstance(images, list) + # Avoid padding since we need the output for each image to be + # independent of other images for the cache to work correctly + image_sizes = processed_outputs["image_sizes"] + assert len(pixel_values) == len(image_sizes) - # Original output: (1, num_images, C, H, W) - # New output: (num_images, C, H, W) - assert (isinstance(pixel_values, list) - and len(pixel_values) == 1) - assert (isinstance(pixel_values[0], list) - and len(pixel_values[0]) == len(images)) - - processed_outputs["pixel_values"] = pixel_values[0] - else: - # Avoid padding since we need the output for each image to be - # independent of other images for the cache to work correctly - image_sizes = processed_outputs["image_sizes"] - assert len(pixel_values) == len(image_sizes) - - processed_outputs["pixel_values"] = [ - p[:, :h, :w] - for p, (h, w) in zip(pixel_values, image_sizes) - ] + processed_outputs["pixel_values"] = [ + p[:, :h, :w] 
for p, (h, w) in zip(pixel_values, image_sizes) + ] return processed_outputs @@ -784,17 +767,10 @@ class MantisProcessingInfo(LlavaProcessingInfo): vision_info = self.get_vision_encoder_info() kwargs.setdefault("patch_size", vision_info.get_patch_size()) - - if Version(TRANSFORMERS_VERSION) < Version("4.48"): - # BUG: num_additional_image_tokens = 0 but treated as 1, - # so we set vision_feature_select_strategy to None to offset this - kwargs.setdefault("vision_feature_select_strategy", None) - else: - # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 - kwargs.setdefault( - "vision_feature_select_strategy", - hf_config.vision_feature_select_strategy, - ) + kwargs.setdefault( + "vision_feature_select_strategy", + hf_config.vision_feature_select_strategy, + ) return self.ctx.get_hf_processor(LlavaProcessor, **kwargs) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 70f2d4a6420b9..e172758b2f2c5 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -331,10 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): return hf_processor - def get_image_processor(self): - hf_processor = self.get_hf_processor() - image_processor = hf_processor.image_processor # type: ignore - return image_processor + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor def get_model_version(self): return get_version_by_config(self.get_hf_config()) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index dea85d320adfd..924f10d82b381 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -533,7 +533,7 @@ class Mllama4ProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object) -> Llama4Processor: return self.ctx.get_hf_processor(Llama4Processor, - use_fast=True, + use_fast=kwargs.pop("use_fast", True), **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 5d0513d707413..b90cb9b39a60b 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -137,34 +137,16 @@ class NemotronVLProcessor(InternVLProcessor): class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): """Processing info for Nemotron VL models.""" - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> NemotronVLProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - - image_processor = self.get_image_processor() + def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor: return self.ctx.init_processor( NemotronVLProcessor, config=self.get_hf_config(), tokenizer=self.get_tokenizer(), - image_processor=image_processor, + image_processor=self.get_image_processor(), **kwargs, ) - def get_image_processor( - self, - **kwargs: object, - ): + def get_image_processor(self, **kwargs: object): return cached_image_processor_from_config( self.ctx.model_config, **kwargs, diff --git 
a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 2f7f8e437f0ad..4bea1392a6814 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -63,21 +63,7 @@ class NVLMProcessor(BaseInternVLProcessor): class NVLMProcessingInfo(BaseInternVLProcessingInfo): - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> NVLMProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - + def get_hf_processor(self, **kwargs: object) -> NVLMProcessor: return self.ctx.init_processor( NVLMProcessor, config=self.get_hf_config(), diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index c8b528048b557..6b27980e0b0c3 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -25,7 +25,7 @@ import torch import torch.nn as nn from torch import Tensor from torch.nn.functional import gumbel_softmax, pad, softmax -from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig +from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear @@ -245,11 +245,12 @@ class VisualEmbedding(torch.nn.Embedding): class OvisProcessingInfo(BaseProcessingInfo): - def get_hf_processor(self, **kwargs): + def get_hf_processor(self, **kwargs: object): return self.ctx.get_hf_processor( OvisProcessor, image_pad_token=self.get_image_pad_token(), image_segment_len=self.get_image_segment_len(), + **kwargs, ) def get_image_segment_len(self) -> int: @@ -269,9 +270,6 @@ class OvisProcessingInfo(BaseProcessingInfo): text_model_type = hf_text_config.model_type return IMAGE_PAD_TOKEN_MAP.get(text_model_type) - def get_image_processor(self) -> BaseImageProcessor: - return self.get_hf_processor().image_processor # type: ignore - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index aa739f22fd7bf..9ef4f8371eb3d 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -318,17 +318,6 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): class Phi3VProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - num_crops: Optional[int] = None, - **kwargs: object, - ) -> ProcessorMixin: - if num_crops is not None: - kwargs["num_crops"] = num_crops - - return self.ctx.get_hf_processor(**kwargs) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index 432b707a61591..e13b8276bf17a 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -696,19 +696,12 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> Phi4MultimodalConfig: return self.ctx.get_hf_config(Phi4MultimodalConfig) - def get_hf_processor( - self, - *, - dynamic_hd: Optional[int] = None, - **kwargs: object, - ) -> Phi4MMProcessor: - if dynamic_hd is not None: - kwargs["dynamic_hd"] = dynamic_hd + def 
get_hf_processor(self, **kwargs: object) -> Phi4MMProcessor: + return self.ctx.get_hf_processor(Phi4MMProcessor, **kwargs) - return self.ctx.get_hf_processor(**kwargs) - - def get_feature_extractor(self) -> Phi4MultimodalFeatureExtractor: - return self.get_hf_processor().audio_processor + def get_feature_extractor( + self, **kwargs: object) -> Phi4MultimodalFeatureExtractor: + return self.get_hf_processor(**kwargs).audio_processor def get_image_processor( self, @@ -1007,7 +1000,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): if audio_data: audio_features = processed_outputs['audio_input_features'] - sr = self.info.get_feature_extractor().sampling_rate + sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate feature_sizes = [ self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data @@ -1043,7 +1036,8 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): audio_token_id = tokenizer.vocab[tokenizer.audio_token] hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - audio_processor = self.info.get_feature_extractor() + audio_processor = self.info.get_feature_extractor( + **hf_processor_mm_kwargs) def get_image_replacement_phi4mm(item_idx: int): images = mm_items.get_items( diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 9b61c3634d841..73e8446e6dea7 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -459,17 +459,6 @@ def cat_with_pad(tensors, dim, padding_value=0): class Phi4MMProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - dynamic_hd: Optional[int] = None, - **kwargs: object, - ) -> ProcessorMixin: - if dynamic_hd is not None: - kwargs["dynamic_hd"] = dynamic_hd - - return self.ctx.get_hf_processor(**kwargs) - @property def image_tokens(self) -> list[str]: return [f"<|image_{i+1}|>" for i in range(100)] @@ -487,8 +476,9 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): image_processor = processor.image_processor return image_processor.dynamic_hd - def get_feature_extractor(self) -> SequenceFeatureExtractor: - return self.get_hf_processor().audio_processor + def get_feature_extractor(self, + **kwargs: object) -> SequenceFeatureExtractor: + return self.get_hf_processor(**kwargs).audio_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None, "image": None} @@ -769,7 +759,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") - sr = self.info.get_feature_extractor().sampling_rate + sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate if (audio_data := mm_data.get("audios", [])): mm_data['audios'] = [(data, sr) for data in audio_data] @@ -816,7 +806,8 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): ) -> Sequence[PromptUpdate]: image_tokens: list[str] = self.info.image_tokens # type: ignore audio_tokens: list[str] = self.info.audio_tokens # type: ignore - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) def get_image_replacement_phi4mm(item_idx: int): diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 
c5a5c10d9509f..b9fed79c84cdd 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -132,50 +132,15 @@ class Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo, def get_hf_config(self): return self.ctx.get_hf_config(Qwen2_5OmniConfig).thinker_config - def get_hf_processor( - self, - *, - sampling_rate: Optional[int] = None, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, list[float]]] = None, - **kwargs: object, - ) -> Qwen2_5OmniProcessor: - if fps is not None: - kwargs["fps"] = fps - - # Monkey patch for Transformers v4.53 - processor_class = Qwen2_5OmniProcessor - if processor_class.image_processor_class != "AutoImageProcessor": - processor_class.image_processor_class = "AutoImageProcessor" - if processor_class.video_processor_class != "AutoVideoProcessor": - processor_class.video_processor_class = "AutoVideoProcessor" - - processor = self.ctx.get_hf_processor( - processor_class, - image_processor=self.get_image_processor(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get( - "use_fast", True)), + def get_hf_processor(self, **kwargs: object) -> Qwen2_5OmniProcessor: + return self.ctx.get_hf_processor( + Qwen2_5OmniProcessor, + use_fast=kwargs.pop("use_fast", True), **kwargs, ) - if not hasattr(processor, "audio_token"): - processor.audio_token = "<|AUDIO|>" - if not hasattr(processor, "image_token"): - processor.image_token = "<|IMAGE|>" - if not hasattr(processor, "video_token"): - processor.video_token = "<|VIDEO|>" - return processor - def get_feature_extractor( - self, - *, - sampling_rate: Optional[int] = None, - **kwargs: object, - ): - hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + def get_feature_extractor(self, **kwargs: object): + hf_processor = self.get_hf_processor(**kwargs) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8ae096536fdc5..c4c4650f569e1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -780,25 +780,10 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2_5_VLConfig) - def get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, list[float]]] = None, - **kwargs: object, - ) -> Qwen2_5_VLProcessor: - if fps is not None: - kwargs["fps"] = fps - + def get_hf_processor(self, **kwargs: object) -> Qwen2_5_VLProcessor: return self.ctx.get_hf_processor( Qwen2_5_VLProcessor, - image_processor=self.get_image_processor(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get( - "use_fast", True)), + use_fast=kwargs.pop("use_fast", True), **kwargs, ) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index d7fec30acd8d3..3ef55cd704cf0 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -86,22 +86,12 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2AudioConfig) - def get_hf_processor( - self, - *, - # Ignored in initialization - 
sampling_rate: Optional[int] = None, - **kwargs: object, - ) -> Qwen2AudioProcessor: + def get_hf_processor(self, **kwargs: object) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs) - def get_feature_extractor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - ) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + def get_feature_extractor(self, + **kwargs: object) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ad63bb4af4e9d..4e8ea8e449133 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -69,8 +69,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from vllm.transformers_utils.processor import ( - cached_image_processor_from_config) from vllm.transformers_utils.tokenizer import AnyTokenizer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -752,73 +750,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) - def get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> Qwen2VLProcessor: + def get_hf_processor(self, **kwargs: object) -> Qwen2VLProcessor: return self.ctx.get_hf_processor( Qwen2VLProcessor, - image_processor=self.get_image_processor(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get( - "use_fast", True)), + use_fast=kwargs.pop("use_fast", True), **kwargs, ) - def _get_image_processor_kwargs( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - mm_config = self.ctx.model_config.get_multimodal_config() - if mm_config.mm_processor_kwargs: - kwargs.update(mm_config.mm_processor_kwargs) - - if min_pixels is not None: - kwargs["min_pixels"] = min_pixels - - if size is None: - size = {"shortest_edge": min_pixels} - else: - size["shortest_edge"] = min_pixels - - if max_pixels is not None: - kwargs["max_pixels"] = max_pixels - - if size is None: - size = {"longest_edge": max_pixels} - else: - size["longest_edge"] = max_pixels - - if size is not None: - kwargs["size"] = size - - return kwargs - - def get_image_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> Qwen2VLImageProcessor: - kwargs["use_fast"] = kwargs.get("use_fast", True) - return cached_image_processor_from_config( - self.ctx.model_config, - **self._get_image_processor_kwargs(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - **kwargs), - ) + def get_image_processor(self, **kwargs: object) -> Qwen2VLImageProcessor: + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} @@ -1023,20 +963,6 @@ class 
Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] def _get_data_parser(self) -> MultiModalDataParser: return Qwen2VLMultiModalDataParser() - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs) - return self.info.ctx.call_hf_processor( - self.info.get_hf_processor(**mm_kwargs), - dict(text=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) - def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 5ae5c0bc1d5dc..c76aabcd27ccb 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -7,9 +7,8 @@ # Copyright (c) 2025 Skywork # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- -from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, TypedDict, TypeVar, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -232,7 +231,7 @@ def image_to_pixel_values_skyworkr1v( return pixel_values -class BaseSkyworkR1VProcessor(ABC): +class SkyworkR1VProcessor: """ This model doesn't define its own HF processor, so we implement our own one here. @@ -279,17 +278,18 @@ class BaseSkyworkR1VProcessor(ABC): self.use_thumbnail: bool = config.use_thumbnail @property - @abstractmethod def image_token_id(self) -> int: - raise NotImplementedError + return self.tokenizer.get_vocab()[IMG_CONTEXT] - @abstractmethod def get_image_repl( self, feature_size: int, num_patches: Optional[int], ) -> PromptUpdateDetails[str]: - raise NotImplementedError + repl_features = IMG_CONTEXT * feature_size + repl_full = IMG_START + repl_features + IMG_END + + return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) def resolve_min_max_num( self, @@ -426,35 +426,15 @@ class BaseSkyworkR1VProcessor(ABC): } -class SkyworkR1VProcessor(BaseSkyworkR1VProcessor): +class SkyworkR1VProcessingInfo(BaseProcessingInfo): - @property - def image_token_id(self) -> int: - return self.tokenizer.get_vocab()[IMG_CONTEXT] - - def get_image_repl( - self, - feature_size: int, - num_patches: Optional[int], - ) -> PromptUpdateDetails[str]: - repl_features = IMG_CONTEXT * feature_size - repl_full = IMG_START + repl_features + IMG_END - - return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) - - -class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): - - @abstractmethod - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> BaseSkyworkR1VProcessor: - raise NotImplementedError + def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor: + return self.ctx.init_processor( + SkyworkR1VProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, + ) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -464,7 +444,7 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Optional[BaseSkyworkR1VProcessor], + processor: Optional[SkyworkR1VProcessor], ) -> int: if processor is None: processor = self.get_hf_processor() @@ -500,10 +480,8 @@ class 
BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): return largest_feature_pinpoint -_I = TypeVar("_I", bound=BaseSkyworkR1VProcessingInfo) - - -class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]): +class SkyworkR1VDummyInputsBuilder( + BaseDummyInputsBuilder[SkyworkR1VProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -527,7 +505,8 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]): } -class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]): +class SkyworkR1VMultiModalProcessor( + BaseMultiModalProcessor[SkyworkR1VProcessingInfo]): def _call_hf_processor( self, @@ -617,31 +596,6 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]): ] -class SkyworkR1VProcessingInfo(BaseSkyworkR1VProcessingInfo): - - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> SkyworkR1VProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - - return self.ctx.init_processor( - SkyworkR1VProcessor, - config=self.get_hf_config(), - tokenizer=self.get_tokenizer(), - **kwargs, - ) - - @MULTIMODAL_REGISTRY.register_processor( SkyworkR1VMultiModalProcessor, info=SkyworkR1VProcessingInfo, diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index 0f22ba5b406ce..2adfad67152b3 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -19,15 +19,7 @@ from .idefics3 import Idefics3ProcessingInfo class SmolVLMProcessingInfo(Idefics3ProcessingInfo): - def get_hf_processor( - self, - *, - max_image_size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> SmolVLMProcessor: - if max_image_size is not None: - kwargs["max_image_size"] = max_image_size - + def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor: return self.ctx.get_hf_processor(SmolVLMProcessor, **kwargs) def _get_image_token( diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 979d789b330cf..70cf5e95a54e1 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -178,13 +178,11 @@ class TarsierProcessingInfo(BaseProcessingInfo): return get_vision_encoder_info(self.get_hf_config()) def get_hf_processor(self, **kwargs: object) -> TarsierProcessor: - hf_processor = self.ctx.get_hf_processor(TarsierProcessor, **kwargs) - # Patch for patch_size if needed (copied from vLLM LLaVA) - if hasattr(hf_processor, - 'patch_size') and hf_processor.patch_size is None: - patch_size = self.get_vision_encoder_info().get_patch_size() - hf_processor.patch_size = patch_size - return hf_processor + vision_info = self.get_vision_encoder_info() + + kwargs.setdefault("patch_size", vision_info.get_patch_size()) + + return self.ctx.get_hf_processor(TarsierProcessor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 8cd95605cdfae..e67548800c354 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -48,7 +48,6 @@ from 
vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP, @@ -189,10 +188,6 @@ class MultiModalProcessingInfo(BaseProcessingInfo): image_tokens = mm_tokens["num_image_tokens"][0] return image_tokens - def get_hf_processor(self): - processor = cached_get_processor(self.ctx.model_config.model) - return processor - def get_max_image_size(self): return 10_000, 10_000 # hardcode for arbitrary very large size diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index a4569ccd5a845..bef34c1be49fe 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -71,13 +71,7 @@ UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, class UltravoxProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - **kwargs: object, - ) -> ProcessorMixin: + def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: config = self.ctx.model_config.hf_config hf_processor = self.ctx.get_hf_processor(**kwargs) @@ -89,13 +83,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo): return hf_processor - def get_feature_extractor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - ) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + def get_feature_extractor(self, + **kwargs: object) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) audio_processor = hf_processor.audio_processor # type: ignore feature_extractor = audio_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) @@ -156,7 +146,7 @@ class UltravoxMultiModalProcessor( audios = mm_data.pop("audios", []) assert isinstance(audios, list) - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index d7bafb9ef84d9..ca02ecd828ba3 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -623,23 +623,22 @@ class WhisperProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> WhisperConfig: return self.ctx.get_hf_config(WhisperConfig) - def get_hf_processor(self, - sampling_rate: Optional[int] = None - ) -> WhisperProcessor: - # HACK: Transformers 4.53.0 has issue with whisper tokenizer to + def get_hf_processor(self, **kwargs: object) -> WhisperProcessor: + # HACK: Transformers 4.53.2 has issue with whisper tokenizer to # initialize processor. We use a monkeypatch to fix it here. 
# See: https://github.com/vllm-project/vllm/issues/20224 processor_class = WhisperProcessor tokenizer_class = ("WhisperTokenizer", "WhisperTokenizerFast") if processor_class.tokenizer_class != tokenizer_class: processor_class.tokenizer_class = tokenizer_class - return self.ctx.get_hf_processor(processor_class) + return self.ctx.get_hf_processor(processor_class, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": 1} - def get_feature_extractor(self) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor() + def get_feature_extractor(self, + **kwargs: object) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor @@ -702,7 +701,7 @@ class WhisperMultiModalProcessor( tok_kwargs: Mapping[str, object], ) -> BatchFeature: if mm_data: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) mm_data = dict(audio=mm_data.pop("audios")) mm_kwargs = dict( **mm_kwargs, diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 70cd08263d372..a630d940b2578 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -4,9 +4,15 @@ from functools import lru_cache from typing import TYPE_CHECKING, Any, Optional, Union, cast +from transformers import (AutoFeatureExtractor, AutoImageProcessor, + AutoProcessor) +from transformers.feature_extraction_utils import FeatureExtractionMixin +from transformers.image_processing_utils import BaseImageProcessor from transformers.processing_utils import ProcessorMixin from typing_extensions import TypeVar +from vllm.utils import get_allowed_kwarg_only_overrides + if TYPE_CHECKING: from vllm.config import ModelConfig @@ -33,23 +39,42 @@ class HashableList(list): return hash(tuple(self)) -def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs): - mm_config = model_config.get_multimodal_config() - base_kwargs = mm_config.mm_processor_kwargs - if base_kwargs is None: - base_kwargs = {} +def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]): + if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin: + return AutoProcessor.from_pretrained + if hasattr(processor_cls, "from_pretrained"): + return processor_cls.from_pretrained - merged_kwargs = {**base_kwargs, **kwargs} + return processor_cls + + +def _merge_mm_kwargs( + model_config: "ModelConfig", + processor_cls: Union[type, tuple[type, ...]], + /, + **kwargs, +): + mm_config = model_config.get_multimodal_config() + merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs) + + factory = _get_processor_factory_fn(processor_cls) + allowed_kwargs = get_allowed_kwarg_only_overrides( + factory, + merged_kwargs, + requires_kw_only=False, + allow_var_kwargs=True, + ) # NOTE: Pythonic dict is not hashable and will raise unhashable type # error when calling `cached_get_processor`, therefore we need to # wrap it to a hashable dict. 
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index 70cd08263d372..a630d940b2578 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -4,9 +4,15 @@
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Union, cast

+from transformers import (AutoFeatureExtractor, AutoImageProcessor,
+                          AutoProcessor)
+from transformers.feature_extraction_utils import FeatureExtractionMixin
+from transformers.image_processing_utils import BaseImageProcessor
 from transformers.processing_utils import ProcessorMixin
 from typing_extensions import TypeVar

+from vllm.utils import get_allowed_kwarg_only_overrides
+
 if TYPE_CHECKING:
     from vllm.config import ModelConfig

@@ -33,23 +39,42 @@ class HashableList(list):
         return hash(tuple(self))


-def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs):
-    mm_config = model_config.get_multimodal_config()
-    base_kwargs = mm_config.mm_processor_kwargs
-    if base_kwargs is None:
-        base_kwargs = {}
+def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]):
+    if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
+        return AutoProcessor.from_pretrained
+    if hasattr(processor_cls, "from_pretrained"):
+        return processor_cls.from_pretrained

-    merged_kwargs = {**base_kwargs, **kwargs}
+    return processor_cls
+
+
+def _merge_mm_kwargs(
+    model_config: "ModelConfig",
+    processor_cls: Union[type, tuple[type, ...]],
+    /,
+    **kwargs,
+):
+    mm_config = model_config.get_multimodal_config()
+    merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs)
+
+    factory = _get_processor_factory_fn(processor_cls)
+    allowed_kwargs = get_allowed_kwarg_only_overrides(
+        factory,
+        merged_kwargs,
+        requires_kw_only=False,
+        allow_var_kwargs=True,
+    )

     # NOTE: Pythonic dict is not hashable and will raise unhashable type
     # error when calling `cached_get_processor`, therefore we need to
     # wrap it to a hashable dict.
-    for key, value in merged_kwargs.items():
+    for key, value in allowed_kwargs.items():
         if isinstance(value, dict):
-            merged_kwargs[key] = HashableDict(value)
+            allowed_kwargs[key] = HashableDict(value)
         if isinstance(value, list):
-            merged_kwargs[key] = HashableList(value)
-    return merged_kwargs
+            allowed_kwargs[key] = HashableList(value)
+
+    return allowed_kwargs


 def get_processor(
@@ -61,21 +86,29 @@
     **kwargs: Any,
 ) -> _P:
     """Load a processor for the given model name via HuggingFace."""
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from transformers import AutoProcessor
-
-    processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or
-                         isinstance(processor_cls, tuple) else processor_cls)
+    if revision is None:
+        revision = "main"

     try:
-        processor = processor_factory.from_pretrained(
-            processor_name,
-            *args,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            **kwargs,
-        )
+        if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin:
+            processor = AutoProcessor.from_pretrained(
+                processor_name,
+                *args,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )
+        elif issubclass(processor_cls, ProcessorMixin):
+            processor = processor_cls.from_pretrained(
+                processor_name,
+                *args,
+                revision=revision,
+                trust_remote_code=trust_remote_code,
+                **kwargs,
+            )
+        else:
+            # Processors that are standalone classes unrelated to HF
+            processor = processor_cls(*args, **kwargs)
     except ValueError as e:
         # If the error pertains to the processor class not existing or not
         # currently being imported, suggest using the --trust-remote-code flag.
@@ -112,7 +145,7 @@ def cached_processor_from_config(
         revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
         processor_cls=processor_cls,  # type: ignore[arg-type]
-        **_merge_mm_kwargs(model_config, **kwargs),
+        **_merge_mm_kwargs(model_config, processor_cls, **kwargs),
     )


@@ -125,10 +158,6 @@ def get_feature_extractor(
 ):
     """Load an audio feature extractor for the given model name
     via HuggingFace."""
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from transformers import AutoFeatureExtractor
-    from transformers.feature_extraction_utils import FeatureExtractionMixin
     try:
         feature_extractor = AutoFeatureExtractor.from_pretrained(
             processor_name,
@@ -164,7 +193,7 @@ def cached_feature_extractor_from_config(
         model_config.model,
         revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
-        **_merge_mm_kwargs(model_config, **kwargs),
+        **_merge_mm_kwargs(model_config, AutoFeatureExtractor, **kwargs),
     )


@@ -176,11 +205,6 @@ def get_image_processor(
     **kwargs: Any,
 ):
     """Load an image processor for the given model name via HuggingFace."""
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from transformers import AutoImageProcessor
-    from transformers.image_processing_utils import BaseImageProcessor
-
     try:
         processor = AutoImageProcessor.from_pretrained(
             processor_name,
@@ -217,5 +241,5 @@ def cached_image_processor_from_config(
         model_config.model,
         revision=model_config.revision,
         trust_remote_code=model_config.trust_remote_code,
-        **_merge_mm_kwargs(model_config, **kwargs),
+        **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
     )
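The rewritten _merge_mm_kwargs above chains three steps: merge the configured mm_processor_kwargs with call-time overrides, keep only the keys the processor factory can actually accept (get_allowed_kwarg_only_overrides applied to the factory from _get_processor_factory_fn), and wrap dict/list values so the result stays hashable for the lru_cache-backed loaders. The standalone sketch below reproduces that flow under the assumption that call-time values win on collisions (as in the resolve_mm_processor_kwargs helper removed further down); FakeProcessor, load_processor, num_crops, and size are made-up names, not the vLLM implementation.

# Standalone sketch of merge -> filter -> hash; not the vLLM helpers.
import inspect
from functools import lru_cache


class HashableDict(dict):
    """dict usable as an lru_cache key (same idea as the wrappers above)."""
    def __hash__(self):
        return hash(frozenset(self.items()))


def filter_kwargs(factory, kwargs):
    """Drop kwargs that the factory's signature cannot accept."""
    params = inspect.signature(factory).parameters
    accepts_var_kw = any(p.kind is inspect.Parameter.VAR_KEYWORD
                         for p in params.values())
    return {k: v for k, v in kwargs.items() if accepts_var_kw or k in params}


class FakeProcessor:
    def __init__(self, name, num_crops=1, size=None):
        self.name, self.num_crops, self.size = name, num_crops, size


@lru_cache
def load_processor(name, **kwargs):
    return FakeProcessor(name, **kwargs)


init_kwargs = {"size": {"shortest_edge": 448}}   # e.g. mm_processor_kwargs
call_kwargs = {"num_crops": 8, "bogus": 1}       # per-request overrides
merged = {**init_kwargs, **call_kwargs}          # call-time values win
allowed = filter_kwargs(FakeProcessor.__init__, merged)  # "bogus" dropped
allowed = {k: HashableDict(v) if isinstance(v, dict) else v
           for k, v in allowed.items()}          # make values cacheable

processor = load_processor("demo", **allowed)
assert processor.num_crops == 8 and processor.size == {"shortest_edge": 448}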
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index ae978c855a8e5..a7f579b0c9c2d 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -2010,49 +2010,6 @@ def supports_kw(
     return False


-def resolve_mm_processor_kwargs(
-    init_kwargs: Optional[Mapping[str, object]],
-    inference_kwargs: Optional[Mapping[str, object]],
-    callable: Callable[..., object],
-    *,
-    requires_kw_only: bool = True,
-    allow_var_kwargs: bool = False,
-) -> dict[str, Any]:
-    """Applies filtering to eliminate invalid mm_processor_kwargs, i.e.,
-    those who are not explicit keywords to the given callable (of one is
-    given; otherwise no filtering is done), then merges the kwarg dicts,
-    giving priority to inference_kwargs if there are any collisions.
-
-    In the case that no kwarg overrides are provided, returns an empty
-    dict so that it can still be kwarg expanded into the callable later on.
-
-    If allow_var_kwargs=True, allows for things that can be expanded into
-    kwargs as long as they aren't naming collision for var_kwargs or potential
-    positional arguments.
-    """
-    # Filter inference time multimodal processor kwargs provided
-    runtime_mm_kwargs = get_allowed_kwarg_only_overrides(
-        callable,
-        overrides=inference_kwargs,
-        requires_kw_only=requires_kw_only,
-        allow_var_kwargs=allow_var_kwargs,
-    )
-
-    # Filter init time multimodal processor kwargs provided
-    init_mm_kwargs = get_allowed_kwarg_only_overrides(
-        callable,
-        overrides=init_kwargs,
-        requires_kw_only=requires_kw_only,
-        allow_var_kwargs=allow_var_kwargs,
-    )
-
-    # Merge the final processor kwargs, prioritizing inference
-    # time values over the initialization time values.
-    mm_processor_kwargs = {**init_mm_kwargs, **runtime_mm_kwargs}
-
-    return mm_processor_kwargs
-
-
 def get_allowed_kwarg_only_overrides(
     callable: Callable[..., object],
     overrides: Optional[Mapping[str, object]],