[Bugfix] Fix InternS1 video processing after Transformers v4.56 (#25644)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Authored by Isotr0py on 2025-09-25 22:46:04 +08:00; committed via GitHub
parent 532a6cfccb
commit 03858e6d1c
4 changed files with 68 additions and 3 deletions


@@ -770,8 +770,9 @@ steps:
   - pytest -v -s tests/models/multimodal/processing/
   - pytest -v -s tests/models/multimodal/test_mapping.py
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/audio_language.py --model-type whisper
   - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # Whisper needs spawn method to avoid deadlock
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

 - label: Blackwell Test # 38 min
   timeout_in_minutes: 60
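
For context, the added pipeline step is equivalent to exporting VLLM_WORKER_MULTIPROC_METHOD=spawn before running the Whisper example. A minimal Python sketch of the same idea (illustrative, not part of this diff):

import os

# Force vLLM worker processes to use the "spawn" start method instead of "fork";
# this avoids the deadlock noted in the comment above. Set it before importing
# or constructing anything from vLLM.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"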


@@ -213,6 +213,7 @@ _IGNORE_MM_KEYS = {
 MM_DATA_PATCHES = {
     # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
     "glm4v": glm4_1v_patch_mm_data,
+    "glm4v_moe": glm4_1v_patch_mm_data,
     "qwen3_vl": qwen3_vl_patch_mm_data,
     "qwen3_vl_moe": qwen3_vl_patch_mm_data,
 }
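
MM_DATA_PATCHES maps a model type to a function that augments the multimodal inputs before processing; the new entry makes the GLM4.1V MoE variant reuse the same video-metadata patch as the dense model. A rough sketch of how such a table is typically consumed (the helper name and patch signature below are illustrative, not the test's actual code):

def maybe_patch_mm_data(model_type: str, mm_data: dict) -> dict:
    # Only a handful of model types need extra video metadata attached;
    # everything else passes through unchanged.
    patch_fn = MM_DATA_PATCHES.get(model_type)
    return patch_fn(mm_data) if patch_fn is not None else mm_data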


@@ -16,6 +16,8 @@ from transformers import BatchFeature, InternVLProcessor, PretrainedConfig
 from transformers.activations import ACT2FN
 from transformers.models.got_ocr2.image_processing_got_ocr2_fast import (
     GotOcr2ImageProcessorFast)
+from transformers.models.internvl.video_processing_internvl import (
+    InternVLVideoProcessor)

 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -31,6 +33,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processor import (
+    cached_video_processor_from_config)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -152,7 +156,12 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
     """ProcessingInfo for InternS1-style models."""

     def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
-        return self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
+        hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
+        hf_processor.video_processor = cached_video_processor_from_config(
+            self.ctx.model_config,
+            processor_cls=InternVLVideoProcessor,
+            **kwargs)
+        return hf_processor

     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}


@@ -5,10 +5,11 @@ from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Union, cast

 from transformers import (AutoFeatureExtractor, AutoImageProcessor,
-                          AutoProcessor)
+                          AutoProcessor, AutoVideoProcessor)
 from transformers.feature_extraction_utils import FeatureExtractionMixin
 from transformers.image_processing_utils import BaseImageProcessor
 from transformers.processing_utils import ProcessorMixin
+from transformers.video_processing_utils import BaseVideoProcessor
 from typing_extensions import TypeVar

 from vllm.utils import get_allowed_kwarg_only_overrides
@@ -17,6 +18,7 @@ if TYPE_CHECKING:
     from vllm.config import ModelConfig

 _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
+_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)


 class HashableDict(dict):
@@ -243,3 +245,55 @@ def cached_image_processor_from_config(
         trust_remote_code=model_config.trust_remote_code,
         **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
     )
+
+
+def get_video_processor(
+    processor_name: str,
+    *args: Any,
+    revision: Optional[str] = None,
+    trust_remote_code: bool = False,
+    processor_cls_overrides: Optional[type[_V]] = None,
+    **kwargs: Any,
+):
+    """Load a video processor for the given model name via HuggingFace."""
+    try:
+        processor_cls = processor_cls_overrides or AutoVideoProcessor
+        processor = processor_cls.from_pretrained(
+            processor_name,
+            *args,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            **kwargs)
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoVideoProcessor does not separate such errors
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the video processor. If the video processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+
+    return cast(BaseVideoProcessor, processor)
+
+
+cached_get_video_processor = lru_cache(get_video_processor)
+
+
+def cached_video_processor_from_config(
+    model_config: "ModelConfig",
+    processor_cls: Optional[type[_V]] = None,
+    **kwargs: Any,
+):
+    return cached_get_video_processor(
+        model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        processor_cls_overrides=processor_cls,  # type: ignore[arg-type]
+        **_merge_mm_kwargs(model_config, AutoVideoProcessor, **kwargs),
+    )
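
The new helpers mirror the existing cached_image_processor_from_config path: get_video_processor loads the processor (optionally pinning the class), lru_cache memoizes it, and cached_video_processor_from_config pulls the model name, revision, trust_remote_code, and merged multimodal kwargs from the ModelConfig. A minimal usage sketch (assuming an already-built ModelConfig, as in the InternS1 change above):

from transformers.models.internvl.video_processing_internvl import (
    InternVLVideoProcessor)

from vllm.transformers_utils.processor import (
    cached_video_processor_from_config)


def load_interns1_video_processor(model_config):
    # Without processor_cls, AutoVideoProcessor resolves the class from the
    # model repo; passing it pins the InternVL implementation explicitly.
    return cached_video_processor_from_config(
        model_config, processor_cls=InternVLVideoProcessor)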