[Bugfix] Fix InternS1 video processing after Transformers v4.56 (#25644)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Authored by Isotr0py on 2025-09-25 22:46:04 +08:00; committed via GitHub
parent 532a6cfccb
commit 03858e6d1c
4 changed files with 68 additions and 3 deletions


@@ -770,8 +770,9 @@ steps:
   - pytest -v -s tests/models/multimodal/processing/
   - pytest -v -s tests/models/multimodal/test_mapping.py
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/audio_language.py --model-type whisper
   - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # Whisper needs spawn method to avoid deadlock
+  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

 - label: Blackwell Test # 38 min
   timeout_in_minutes: 60
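
For context, the added pipeline step is equivalent to exporting VLLM_WORKER_MULTIPROC_METHOD=spawn before running the Whisper example. A minimal Python sketch of the same idea (illustrative, not part of this diff):

import os

# Force vLLM worker processes to use the "spawn" start method instead of "fork";
# this avoids the deadlock noted in the comment above. Set it before importing
# or constructing anything from vLLM.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"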


@@ -213,6 +213,7 @@ _IGNORE_MM_KEYS = {
 MM_DATA_PATCHES = {
     # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
     "glm4v": glm4_1v_patch_mm_data,
+    "glm4v_moe": glm4_1v_patch_mm_data,
     "qwen3_vl": qwen3_vl_patch_mm_data,
     "qwen3_vl_moe": qwen3_vl_patch_mm_data,
 }
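
MM_DATA_PATCHES maps a model type to a function that augments the multimodal inputs before processing; the new entry makes the GLM4.1V MoE variant reuse the same video-metadata patch as the dense model. A rough sketch of how such a table is typically consumed (the helper name and patch signature below are illustrative, not the test's actual code):

def maybe_patch_mm_data(model_type: str, mm_data: dict) -> dict:
    # Only a handful of model types need extra video metadata attached;
    # everything else passes through unchanged.
    patch_fn = MM_DATA_PATCHES.get(model_type)
    return patch_fn(mm_data) if patch_fn is not None else mm_data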


@@ -16,6 +16,8 @@ from transformers import BatchFeature, InternVLProcessor, PretrainedConfig
 from transformers.activations import ACT2FN
 from transformers.models.got_ocr2.image_processing_got_ocr2_fast import (
     GotOcr2ImageProcessorFast)
+from transformers.models.internvl.video_processing_internvl import (
+    InternVLVideoProcessor)

 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -31,6 +33,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processor import (
+    cached_video_processor_from_config)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -152,7 +156,12 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
     """ProcessingInfo for InternS1-style models."""

     def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
-        return self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
+        hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
+        hf_processor.video_processor = cached_video_processor_from_config(
+            self.ctx.model_config,
+            processor_cls=InternVLVideoProcessor,
+            **kwargs)
+        return hf_processor

     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}


@@ -5,10 +5,11 @@ from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Union, cast

 from transformers import (AutoFeatureExtractor, AutoImageProcessor,
-                          AutoProcessor)
+                          AutoProcessor, AutoVideoProcessor)
 from transformers.feature_extraction_utils import FeatureExtractionMixin
 from transformers.image_processing_utils import BaseImageProcessor
 from transformers.processing_utils import ProcessorMixin
+from transformers.video_processing_utils import BaseVideoProcessor
 from typing_extensions import TypeVar

 from vllm.utils import get_allowed_kwarg_only_overrides
@@ -17,6 +18,7 @@ if TYPE_CHECKING:
     from vllm.config import ModelConfig

 _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
+_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)


 class HashableDict(dict):
@@ -243,3 +245,55 @@ def cached_image_processor_from_config(
         trust_remote_code=model_config.trust_remote_code,
         **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
     )
+
+
+def get_video_processor(
+    processor_name: str,
+    *args: Any,
+    revision: Optional[str] = None,
+    trust_remote_code: bool = False,
+    processor_cls_overrides: Optional[type[_V]] = None,
+    **kwargs: Any,
+):
+    """Load a video processor for the given model name via HuggingFace."""
+    try:
+        processor_cls = processor_cls_overrides or AutoVideoProcessor
+        processor = processor_cls.from_pretrained(
+            processor_name,
+            *args,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            **kwargs)
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoVideoProcessor does not separate such errors
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the video processor. If the video processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+
+    return cast(BaseVideoProcessor, processor)
+
+
+cached_get_video_processor = lru_cache(get_video_processor)
+
+
+def cached_video_processor_from_config(
+    model_config: "ModelConfig",
+    processor_cls: Optional[type[_V]] = None,
+    **kwargs: Any,
+):
+    return cached_get_video_processor(
+        model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        processor_cls_overrides=processor_cls,  # type: ignore[arg-type]
+        **_merge_mm_kwargs(model_config, AutoVideoProcessor, **kwargs),
+    )
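
The new helpers mirror the existing cached_image_processor_from_config path: get_video_processor loads the processor (optionally pinning the class), lru_cache memoizes it, and cached_video_processor_from_config pulls the model name, revision, trust_remote_code, and merged multimodal kwargs from the ModelConfig. A minimal usage sketch (assuming an already-built ModelConfig, as in the InternS1 change above):

from transformers.models.internvl.video_processing_internvl import (
    InternVLVideoProcessor)

from vllm.transformers_utils.processor import (
    cached_video_processor_from_config)


def load_interns1_video_processor(model_config):
    # Without processor_cls, AutoVideoProcessor resolves the class from the
    # model repo; passing it pins the InternVL implementation explicitly.
    return cached_video_processor_from_config(
        model_config, processor_cls=InternVLVideoProcessor)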