mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-20 14:17:04 +08:00
[BugFix] Fix the issue where image embeddings were incorrectly split.… (#23366)
Signed-off-by: bppps <bpppsaka@gmail.com>
Co-authored-by: zouyu.zzx <zouyu.zzx@alibaba-inc.com>
Co-authored-by: bppps <bpppsaka@gmail.com>
This commit is contained in:
parent
88491c1b6b
commit
424fb7a5d2
@@ -74,7 +74,8 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
|||||||
from ..layers.activation import SiluAndMul
|
from ..layers.activation import SiluAndMul
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||||
SupportsMultiModal, SupportsPP)
|
SupportsMultiModal, SupportsPP)
|
||||||
from .qwen2_vl import _qwen2vl_field_config, apply_rotary_pos_emb_vision
|
from .qwen2_vl import (_create_qwen2vl_field_factory,
|
||||||
|
apply_rotary_pos_emb_vision)
|
||||||
from .utils import (AutoWeightsLoader, WeightsMapper,
|
from .utils import (AutoWeightsLoader, WeightsMapper,
|
||||||
init_vllm_registered_model, maybe_prefix,
|
init_vllm_registered_model, maybe_prefix,
|
||||||
merge_multimodal_embeddings)
|
merge_multimodal_embeddings)
|
||||||
@@ -1153,7 +1154,9 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
|
|||||||
hf_inputs: BatchFeature,
|
hf_inputs: BatchFeature,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
) -> Mapping[str, MultiModalFieldConfig]:
|
) -> Mapping[str, MultiModalFieldConfig]:
|
||||||
return _qwen2vl_field_config(hf_inputs)
|
return _create_qwen2vl_field_factory(
|
||||||
|
self.info.get_hf_config().vision_config.spatial_merge_size)(
|
||||||
|
hf_inputs)
|
||||||
|
|
||||||
def _get_prompt_updates(
|
def _get_prompt_updates(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -25,7 +25,7 @@
|
|||||||
from collections.abc import Iterable, Mapping, Sequence
|
from collections.abc import Iterable, Mapping, Sequence
|
||||||
from copy import copy
|
from copy import copy
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import Any, Optional, Union
|
from typing import Any, Callable, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
@@ -79,40 +79,57 @@ except (ImportError, ModuleNotFoundError):
|
|||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def _qwen2_5_omni_thinker_field_config(hf_inputs: Mapping[str, torch.Tensor]):
|
def create_qwen2_5_omni_thinker_field_factory(
|
||||||
audio_feature_lengths = hf_inputs.get("audio_feature_lengths",
|
spatial_merge_size: int
|
||||||
torch.empty((0, )))
|
) -> Callable[[Mapping[str, torch.Tensor]], Mapping[str,
|
||||||
|
MultiModalFieldConfig]]:
|
||||||
|
|
||||||
image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
|
def _qwen2_5_omni_thinker_field_config(hf_inputs: Mapping[str,
|
||||||
image_grid_sizes = image_grid_thw.prod(-1)
|
torch.Tensor]):
|
||||||
|
audio_feature_lengths = hf_inputs.get("audio_feature_lengths",
|
||||||
|
torch.empty((0, )))
|
||||||
|
|
||||||
video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
|
image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
|
||||||
video_grid_sizes = video_grid_thw.prod(-1)
|
image_pixel_grid_sizes = image_grid_thw.prod(-1)
|
||||||
|
image_embed_grid_sizes = (image_pixel_grid_sizes //
|
||||||
|
spatial_merge_size // spatial_merge_size)
|
||||||
|
|
||||||
num_videos = len(video_grid_sizes)
|
video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
|
||||||
|
video_grid_sizes = video_grid_thw.prod(-1)
|
||||||
|
video_embed_grid_sizes = (video_grid_sizes // spatial_merge_size //
|
||||||
|
spatial_merge_size)
|
||||||
|
|
||||||
return dict(
|
num_videos = len(video_grid_sizes)
|
||||||
input_audio_features=MultiModalFieldConfig.flat_from_sizes(
|
|
||||||
"audio", audio_feature_lengths, dim=1),
|
return dict(
|
||||||
feature_attention_mask=MultiModalFieldConfig.batched("audio"),
|
input_audio_features=MultiModalFieldConfig.flat_from_sizes(
|
||||||
audio_feature_lengths=MultiModalFieldConfig.batched("audio"),
|
"audio", audio_feature_lengths, dim=1),
|
||||||
pixel_values=MultiModalFieldConfig.flat_from_sizes(
|
feature_attention_mask=MultiModalFieldConfig.batched("audio"),
|
||||||
"image", image_grid_sizes),
|
audio_feature_lengths=MultiModalFieldConfig.batched("audio"),
|
||||||
image_embeds=MultiModalFieldConfig.flat_from_sizes(
|
pixel_values=MultiModalFieldConfig.flat_from_sizes(
|
||||||
"image", image_grid_sizes),
|
"image", image_pixel_grid_sizes),
|
||||||
image_grid_thw=MultiModalFieldConfig.batched("image"),
|
image_embeds=MultiModalFieldConfig.flat_from_sizes(
|
||||||
pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
|
"image", image_embed_grid_sizes),
|
||||||
"video", video_grid_sizes),
|
image_grid_thw=MultiModalFieldConfig.batched("image"),
|
||||||
video_embeds=MultiModalFieldConfig.flat_from_sizes(
|
pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
|
||||||
"video", video_grid_sizes),
|
"video", video_grid_sizes),
|
||||||
video_grid_thw=MultiModalFieldConfig.batched("video"),
|
video_embeds=MultiModalFieldConfig.flat_from_sizes(
|
||||||
second_per_grid_ts=MultiModalFieldConfig.batched("video"),
|
"video", video_embed_grid_sizes),
|
||||||
use_audio_in_video=MultiModalFieldConfig.shared("video", num_videos),
|
video_grid_thw=MultiModalFieldConfig.batched("video"),
|
||||||
)
|
second_per_grid_ts=MultiModalFieldConfig.batched("video"),
|
||||||
|
use_audio_in_video=MultiModalFieldConfig.shared(
|
||||||
|
"video", num_videos),
|
||||||
|
)
|
||||||
|
|
||||||
|
return _qwen2_5_omni_thinker_field_config
|
||||||
|
|
||||||
|
|
||||||
class Qwen2_5OmniThinkerMultiModalDataParser(Qwen2VLMultiModalDataParser):
|
class Qwen2_5OmniThinkerMultiModalDataParser(Qwen2VLMultiModalDataParser):
|
||||||
|
|
||||||
|
def __init__(self, spatial_merge_size: int, *args, **kwargs):
|
||||||
|
self._spatial_merge_size = spatial_merge_size
|
||||||
|
super().__init__(self._spatial_merge_size, *args, **kwargs)
|
||||||
|
|
||||||
def _parse_audio_data(
|
def _parse_audio_data(
|
||||||
self,
|
self,
|
||||||
data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
|
data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
|
||||||
@@ -124,7 +141,8 @@ class Qwen2_5OmniThinkerMultiModalDataParser(Qwen2VLMultiModalDataParser):
|
|||||||
required_fields={
|
required_fields={
|
||||||
"input_audio_features", "audio_feature_lengths"
|
"input_audio_features", "audio_feature_lengths"
|
||||||
},
|
},
|
||||||
fields_factory=_qwen2_5_omni_thinker_field_config,
|
fields_factory=create_qwen2_5_omni_thinker_field_factory(
|
||||||
|
self._spatial_merge_size),
|
||||||
)
|
)
|
||||||
|
|
||||||
return super()._parse_audio_data(data)
|
return super()._parse_audio_data(data)
|
||||||
@@ -214,6 +232,8 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
|||||||
def _get_data_parser(self) -> MultiModalDataParser:
|
def _get_data_parser(self) -> MultiModalDataParser:
|
||||||
feature_extractor = self.info.get_feature_extractor()
|
feature_extractor = self.info.get_feature_extractor()
|
||||||
return Qwen2_5OmniThinkerMultiModalDataParser(
|
return Qwen2_5OmniThinkerMultiModalDataParser(
|
||||||
|
spatial_merge_size=self.info.get_hf_config(
|
||||||
|
).vision_config.spatial_merge_size,
|
||||||
target_sr=feature_extractor.sampling_rate)
|
target_sr=feature_extractor.sampling_rate)
|
||||||
|
|
||||||
def _call_hf_processor(
|
def _call_hf_processor(
|
||||||
@@ -265,7 +285,9 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
|||||||
hf_inputs: BatchFeature,
|
hf_inputs: BatchFeature,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
) -> Mapping[str, MultiModalFieldConfig]:
|
) -> Mapping[str, MultiModalFieldConfig]:
|
||||||
return _qwen2_5_omni_thinker_field_config(hf_inputs)
|
return create_qwen2_5_omni_thinker_field_factory(
|
||||||
|
self.info.get_hf_config().vision_config.spatial_merge_size)(
|
||||||
|
hf_inputs)
|
||||||
|
|
||||||
def _maybe_apply_prompt_updates(
|
def _maybe_apply_prompt_updates(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -699,29 +699,46 @@ class Qwen2VisionTransformer(nn.Module):
|
|||||||
return loaded_params
|
return loaded_params
|
||||||
|
|
||||||
|
|
||||||
def _qwen2vl_field_config(hf_inputs: Mapping[str, torch.Tensor]):
|
def _create_qwen2vl_field_factory(
|
||||||
image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
|
spatial_merge_size: int
|
||||||
image_grid_sizes = image_grid_thw.prod(-1)
|
) -> Callable[
|
||||||
|
[Mapping[str, torch.Tensor]],
|
||||||
|
Mapping[str, MultiModalFieldConfig],
|
||||||
|
]:
|
||||||
|
|
||||||
video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
|
def _qwen2vl_field_config(hf_inputs: Mapping[str, torch.Tensor]):
|
||||||
video_grid_sizes = video_grid_thw.prod(-1)
|
image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
|
||||||
|
image_pixel_grid_sizes = image_grid_thw.prod(-1)
|
||||||
|
image_embed_grid_sizes = (image_pixel_grid_sizes //
|
||||||
|
spatial_merge_size // spatial_merge_size)
|
||||||
|
|
||||||
return dict(
|
video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3)))
|
||||||
pixel_values=MultiModalFieldConfig.flat_from_sizes(
|
video_grid_sizes = video_grid_thw.prod(-1)
|
||||||
"image", image_grid_sizes),
|
video_embed_grid_sizes = (video_grid_sizes // spatial_merge_size //
|
||||||
image_embeds=MultiModalFieldConfig.flat_from_sizes(
|
spatial_merge_size)
|
||||||
"image", image_grid_sizes),
|
|
||||||
image_grid_thw=MultiModalFieldConfig.batched("image"),
|
return dict(
|
||||||
pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
|
pixel_values=MultiModalFieldConfig.flat_from_sizes(
|
||||||
"video", video_grid_sizes),
|
"image", image_pixel_grid_sizes),
|
||||||
video_embeds=MultiModalFieldConfig.flat_from_sizes(
|
image_embeds=MultiModalFieldConfig.flat_from_sizes(
|
||||||
"video", video_grid_sizes),
|
"image", image_embed_grid_sizes),
|
||||||
video_grid_thw=MultiModalFieldConfig.batched("video"),
|
image_grid_thw=MultiModalFieldConfig.batched("image"),
|
||||||
)
|
pixel_values_videos=MultiModalFieldConfig.flat_from_sizes(
|
||||||
|
"video", video_grid_sizes),
|
||||||
|
video_embeds=MultiModalFieldConfig.flat_from_sizes(
|
||||||
|
"video", video_embed_grid_sizes),
|
||||||
|
video_grid_thw=MultiModalFieldConfig.batched("video"),
|
||||||
|
)
|
||||||
|
|
||||||
|
return _qwen2vl_field_config
|
||||||
|
|
||||||
|
|
||||||
class Qwen2VLMultiModalDataParser(MultiModalDataParser):
|
class Qwen2VLMultiModalDataParser(MultiModalDataParser):
|
||||||
|
|
||||||
|
def __init__(self, spatial_merge_size: int, *args, **kwargs):
|
||||||
|
self._spatial_merge_size = spatial_merge_size
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
def _parse_image_data(
|
def _parse_image_data(
|
||||||
self,
|
self,
|
||||||
data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
|
data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
|
||||||
@@ -731,7 +748,8 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser):
|
|||||||
data,
|
data,
|
||||||
modality="image",
|
modality="image",
|
||||||
required_fields={"image_embeds", "image_grid_thw"},
|
required_fields={"image_embeds", "image_grid_thw"},
|
||||||
fields_factory=_qwen2vl_field_config,
|
fields_factory=_create_qwen2vl_field_factory(
|
||||||
|
self._spatial_merge_size),
|
||||||
)
|
)
|
||||||
|
|
||||||
return super()._parse_image_data(data)
|
return super()._parse_image_data(data)
|
||||||
@@ -745,7 +763,8 @@ class Qwen2VLMultiModalDataParser(MultiModalDataParser):
|
|||||||
data,
|
data,
|
||||||
modality="video",
|
modality="video",
|
||||||
required_fields={"video_embeds", "video_grid_thw"},
|
required_fields={"video_embeds", "video_grid_thw"},
|
||||||
fields_factory=_qwen2vl_field_config,
|
fields_factory=_create_qwen2vl_field_factory(
|
||||||
|
self._spatial_merge_size),
|
||||||
)
|
)
|
||||||
|
|
||||||
return super()._parse_video_data(data)
|
return super()._parse_video_data(data)
|
||||||
@@ -967,7 +986,8 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
|
|||||||
):
|
):
|
||||||
|
|
||||||
def _get_data_parser(self) -> MultiModalDataParser:
|
def _get_data_parser(self) -> MultiModalDataParser:
|
||||||
return Qwen2VLMultiModalDataParser()
|
return Qwen2VLMultiModalDataParser(
|
||||||
|
self.info.get_hf_config().vision_config.spatial_merge_size)
|
||||||
|
|
||||||
def _get_prompt_updates(
|
def _get_prompt_updates(
|
||||||
self,
|
self,
|
||||||
@@ -1010,7 +1030,9 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
|
|||||||
hf_inputs: BatchFeature,
|
hf_inputs: BatchFeature,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
) -> Mapping[str, MultiModalFieldConfig]:
|
) -> Mapping[str, MultiModalFieldConfig]:
|
||||||
return _qwen2vl_field_config(hf_inputs)
|
return _create_qwen2vl_field_factory(
|
||||||
|
self.info.get_hf_config().vision_config.spatial_merge_size)(
|
||||||
|
hf_inputs)
|
||||||
|
|
||||||
|
|
||||||
@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor,
|
@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user