[Bugfix] Fix InternS1 video processing after Transformers v4.56 (#25644)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
commit 03858e6d1c
parent 532a6cfccb
@@ -770,8 +770,9 @@ steps:
     - pytest -v -s tests/models/multimodal/processing/
     - pytest -v -s tests/models/multimodal/test_mapping.py
     - python3 examples/offline_inference/basic/chat.py
-    - python3 examples/offline_inference/audio_language.py --model-type whisper
     - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
 - label: Blackwell Test # 38 min
   timeout_in_minutes: 60
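This CI hunk moves the Whisper audio-language example onto the spawn worker start method instead of the default fork. Below is a minimal stand-alone sketch (plain multiprocessing, not vLLM code) of the distinction the comment refers to:

import multiprocessing as mp

def double(x: int) -> int:
    return x * 2

if __name__ == "__main__":
    # "spawn" starts each worker in a fresh interpreter, so workers do not
    # inherit locks or partially initialized state (e.g. CUDA contexts,
    # background threads) from the parent; "fork" copies the parent process
    # wholesale, which is where such deadlocks typically come from.
    ctx = mp.get_context("spawn")
    with ctx.Pool(processes=2) as pool:
        print(pool.map(double, [1, 2, 3]))  # [2, 4, 6]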
@@ -213,6 +213,7 @@ _IGNORE_MM_KEYS = {
 MM_DATA_PATCHES = {
     # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
     "glm4v": glm4_1v_patch_mm_data,
+    "glm4v_moe": glm4_1v_patch_mm_data,
     "qwen3_vl": qwen3_vl_patch_mm_data,
     "qwen3_vl_moe": qwen3_vl_patch_mm_data,
 }
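MM_DATA_PATCHES maps a model type to a function that rewrites the multimodal inputs before they reach the HF processor; this hunk registers glm4v_moe so the MoE variant gets the same video-metadata patch as dense glm4v. A sketch of that dispatch pattern, with a hypothetical patch function standing in for glm4_1v_patch_mm_data:

from typing import Any, Callable

# Stand-in for glm4_1v_patch_mm_data; the field name is hypothetical.
def add_video_metadata(mm_data: dict[str, Any]) -> dict[str, Any]:
    mm_data.setdefault("video_metadata", {"fps": 1.0})
    return mm_data

PATCHES: dict[str, Callable[[dict[str, Any]], dict[str, Any]]] = {
    "glm4v": add_video_metadata,
    "glm4v_moe": add_video_metadata,  # MoE variant reuses the dense patch
}

def apply_patch(model_type: str, mm_data: dict[str, Any]) -> dict[str, Any]:
    # Unregistered model types pass through unchanged.
    return PATCHES.get(model_type, lambda d: d)(mm_data)

print(apply_patch("glm4v_moe", {"video": [b"frame"]}))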
@@ -16,6 +16,8 @@ from transformers import BatchFeature, InternVLProcessor, PretrainedConfig
 from transformers.activations import ACT2FN
 from transformers.models.got_ocr2.image_processing_got_ocr2_fast import (
     GotOcr2ImageProcessorFast)
+from transformers.models.internvl.video_processing_internvl import (
+    InternVLVideoProcessor)
 
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -31,6 +33,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processor import (
+    cached_video_processor_from_config)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
@@ -152,7 +156,12 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
     """ProcessingInfo for InternS1-style models."""
 
     def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
-        return self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
+        hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
+        hf_processor.video_processor = cached_video_processor_from_config(
+            self.ctx.model_config,
+            processor_cls=InternVLVideoProcessor,
+            **kwargs)
+        return hf_processor
 
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
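This is the heart of the fix: after Transformers v4.56, the video processor that InternVLProcessor resolves on its own is presumably no longer the right one for InternS1, so get_hf_processor now overwrites the composed processor's video_processor attribute with an explicitly constructed InternVLVideoProcessor. A minimal sketch of that override pattern, using hypothetical stand-in classes rather than the real APIs:

class WrongVideoProcessor:
    name = "auto-resolved"

class RightVideoProcessor:
    name = "explicitly pinned"

class ComposedProcessor:
    def __init__(self):
        # What auto-resolution would pick by default.
        self.video_processor = WrongVideoProcessor()

def get_processor() -> ComposedProcessor:
    proc = ComposedProcessor()
    # Replace just the sub-processor after construction rather than forking
    # the whole composed processor class; everything else is unchanged.
    proc.video_processor = RightVideoProcessor()
    return proc

print(get_processor().video_processor.name)  # "explicitly pinned"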
@@ -5,10 +5,11 @@ from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Optional, Union, cast
 
 from transformers import (AutoFeatureExtractor, AutoImageProcessor,
-                          AutoProcessor)
+                          AutoProcessor, AutoVideoProcessor)
 from transformers.feature_extraction_utils import FeatureExtractionMixin
 from transformers.image_processing_utils import BaseImageProcessor
 from transformers.processing_utils import ProcessorMixin
+from transformers.video_processing_utils import BaseVideoProcessor
 from typing_extensions import TypeVar
 
 from vllm.utils import get_allowed_kwarg_only_overrides
@@ -17,6 +18,7 @@ if TYPE_CHECKING:
     from vllm.config import ModelConfig
 
 _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
+_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
 
 
 class HashableDict(dict):
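_V mirrors the existing _P type variable: typing_extensions.TypeVar accepts a PEP 696 default, so APIs that take an optional processor class can type-check as returning BaseVideoProcessor when no override is supplied. A small self-contained illustration of that mechanism (the names here are made up for the example):

from typing import Optional
from typing_extensions import TypeVar

class Base:
    pass

class Sub(Base):
    pass

# The bound restricts substitutions to Base subtypes; the default is what
# _T resolves to when the caller does not pin a class.
_T = TypeVar("_T", bound=Base, default=Base)

def load(cls: Optional[type[_T]] = None) -> _T:
    return (cls or Base)()  # type: ignore[return-value]

print(type(load()).__name__, type(load(Sub)).__name__)  # Base Sub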
@@ -243,3 +245,55 @@ def cached_image_processor_from_config(
         trust_remote_code=model_config.trust_remote_code,
         **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs),
     )
+
+
+def get_video_processor(
+    processor_name: str,
+    *args: Any,
+    revision: Optional[str] = None,
+    trust_remote_code: bool = False,
+    processor_cls_overrides: Optional[type[_V]] = None,
+    **kwargs: Any,
+):
+    """Load a video processor for the given model name via HuggingFace."""
+    try:
+        processor_cls = processor_cls_overrides or AutoVideoProcessor
+        processor = processor_cls.from_pretrained(
+            processor_name,
+            *args,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            **kwargs)
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoVideoProcessor does not separate such errors
+        if not trust_remote_code:
+            err_msg = (
+                "Failed to load the video processor. If the video processor is "
+                "a custom processor not yet available in the HuggingFace "
+                "transformers library, consider setting "
+                "`trust_remote_code=True` in LLM or using the "
+                "`--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+
+    return cast(BaseVideoProcessor, processor)
+
+
+cached_get_video_processor = lru_cache(get_video_processor)
+
+
+def cached_video_processor_from_config(
+    model_config: "ModelConfig",
+    processor_cls: Optional[type[_V]] = None,
+    **kwargs: Any,
+):
+    return cached_get_video_processor(
+        model_config.model,
+        revision=model_config.revision,
+        trust_remote_code=model_config.trust_remote_code,
+        processor_cls_overrides=processor_cls,  # type: ignore[arg-type]
+        **_merge_mm_kwargs(model_config, AutoVideoProcessor, **kwargs),
+    )
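The new loaders mirror the module's existing image-processor pair: get_video_processor wraps AutoVideoProcessor.from_pretrained (with an optional class override and a friendlier trust-remote-code error), and cached_get_video_processor memoizes it via lru_cache so each model config resolves its processor once. A self-contained sketch of that caching pattern, with a stand-in loader and hypothetical model name rather than the real API:

from functools import lru_cache
from typing import Optional

@lru_cache
def load_processor(name: str, revision: Optional[str] = None) -> dict:
    print(f"loading {name}@{revision}")  # runs once per unique argument tuple
    return {"name": name, "revision": revision}

a = load_processor("some-org/some-model", revision="main")
b = load_processor("some-org/some-model", revision="main")
assert a is b  # the second call is served from the cache

Note that lru_cache keys on its arguments, so every kwarg must be hashable; that is presumably what the module's HashableDict (visible in an earlier hunk) exists to guarantee for merged processor kwargs.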