diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 200ed344c4e8..7a1f38606062 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -770,8 +770,9 @@ steps:
   - pytest -v -s tests/models/multimodal/processing/
   - pytest -v -s tests/models/multimodal/test_mapping.py
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/audio_language.py --model-type whisper
   - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # Whisper needs spawn method to avoid deadlock
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
 - label: Blackwell Test # 38 min
   timeout_in_minutes: 60
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 4eb8e0cfaa5d..ddc675b0849c 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -213,6 +213,7 @@ _IGNORE_MM_KEYS = {
 MM_DATA_PATCHES = {
     # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
     "glm4v": glm4_1v_patch_mm_data,
+    "glm4v_moe": glm4_1v_patch_mm_data,
     "qwen3_vl": qwen3_vl_patch_mm_data,
     "qwen3_vl_moe": qwen3_vl_patch_mm_data,
 }
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index ba72c288b2b1..197d629b906f 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -16,6 +16,8 @@ from transformers import BatchFeature, InternVLProcessor, PretrainedConfig
 from transformers.activations import ACT2FN
 from transformers.models.got_ocr2.image_processing_got_ocr2_fast import (
     GotOcr2ImageProcessorFast)
+from transformers.models.internvl.video_processing_internvl import (
+    InternVLVideoProcessor)
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -31,6 +33,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processor import (
+    cached_video_processor_from_config)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -152,7 +156,12 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
     """ProcessingInfo for InternS1-style models."""
 
     def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
-        return self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
+        hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
+        hf_processor.video_processor = cached_video_processor_from_config(
+            self.ctx.model_config,
+            processor_cls=InternVLVideoProcessor,
+            **kwargs)
+        return hf_processor
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py
index a630d940b257..51bcce6c10e2 100644
--- a/vllm/transformers_utils/processor.py
+++ b/vllm/transformers_utils/processor.py
@@ -5,10 +5,11 @@ from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Union, cast
 
 from transformers import (AutoFeatureExtractor, AutoImageProcessor,
-                          AutoProcessor)
+                          AutoProcessor, AutoVideoProcessor)
 from transformers.feature_extraction_utils import FeatureExtractionMixin
 from transformers.image_processing_utils import BaseImageProcessor
 from transformers.processing_utils import ProcessorMixin
+from transformers.video_processing_utils import BaseVideoProcessor
 from typing_extensions import TypeVar
 
 from vllm.utils import get_allowed_kwarg_only_overrides
@@ -17,6 +18,7 @@ if TYPE_CHECKING:
     from vllm.config import ModelConfig
 
 _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
+_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
 
 
 class HashableDict(dict):
@@ -243,3 +245,55 @@ def cached_image_processor_from_config(
         trust_remote_code=model_config.trust_remote_code,
         **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
     )
+
+
+def get_video_processor(
+    processor_name: str,
+    *args: Any,
+    revision: Optional[str] = None,
+    trust_remote_code: bool = False,
+    processor_cls_overrides: Optional[type[_V]] = None,
+    **kwargs: Any,
+):
+    """Load a video processor for the given model name via HuggingFace."""
+    try:
+        processor_cls = processor_cls_overrides or AutoVideoProcessor
+        processor = processor_cls.from_pretrained(
+            processor_name,
+            *args,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            **kwargs)
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoVideoProcessor does not separate such errors
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the video processor. If the video processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+
+    return cast(BaseVideoProcessor, processor)
+
+
+cached_get_video_processor = lru_cache(get_video_processor)
+
+
+def cached_video_processor_from_config(
+    model_config: "ModelConfig",
+    processor_cls: Optional[type[_V]] = None,
+    **kwargs: Any,
+):
+    return cached_get_video_processor(
+        model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        processor_cls_overrides=processor_cls,  # type: ignore[arg-type]
+        **_merge_mm_kwargs(model_config, AutoVideoProcessor, **kwargs),
+    )
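
For reference, a minimal usage sketch of the new helper (not part of the patch): the model name below is an illustrative placeholder, and the `processor_cls_overrides` argument mirrors the InternVL override used for InternS1 above.

```python
# Illustrative sketch only -- the model name is a placeholder, not a tested repo.
from transformers.models.internvl.video_processing_internvl import (
    InternVLVideoProcessor)

from vllm.transformers_utils.processor import cached_get_video_processor

# Load (and cache) the video processor, forcing the InternVL implementation
# instead of whatever AutoVideoProcessor would resolve to for this repo.
video_processor = cached_get_video_processor(
    "OpenGVLab/InternS1-mini",  # placeholder model name
    trust_remote_code=True,
    processor_cls_overrides=InternVLVideoProcessor,
)
```

In-engine callers would normally go through `cached_video_processor_from_config(model_config, ...)` instead, as the `interns1.py` hunk does, so that the model config's multimodal kwargs are merged in via `_merge_mm_kwargs`.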