Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-14 20:54:59 +08:00)
[Model] Pass mm_features directly into get_mrope_input_positions (#28399)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent 7dbe6d81d6
commit afffd3cc8a
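This change threads each request's multimodal feature specs straight into the per-model `get_mrope_input_positions`, replacing the bundle of modality-specific arguments (`hf_config`, `image_grid_thw`, `video_grid_thw`, `second_per_grid_ts`, `audio_feature_lengths`, `use_audio_in_video`) that the GPU model runner previously unpacked on every model's behalf. A condensed sketch of the call-site change, abbreviated from the diffs below:

    # Before: the runner extracted every modality-specific kwarg up front.
    positions, delta = model.get_mrope_input_positions(
        req_state.prompt_token_ids,
        hf_config=self.model_config.hf_config,
        image_grid_thw=image_grid_thw,
        video_grid_thw=video_grid_thw,
        second_per_grid_ts=second_per_grid_ts,
        audio_feature_lengths=audio_feature_lengths,
        use_audio_in_video=use_audio_in_video,
    )

    # After: each model gathers exactly the fields it needs from the specs.
    positions, delta = model.get_mrope_input_positions(
        req_state.prompt_token_ids,
        req_state.mm_features,
    )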
@@ -34,7 +34,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import (
@@ -58,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
@@ -1433,15 +1434,16 @@ class Ernie4_5_VLMoeForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for Ernie VL."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
 
+        hf_config = self.config
         image_token_id = hf_config.im_patch_id
         video_start_token_id = hf_config.video_start_token_id
         video_end_token_id = hf_config.video_end_token_id
@@ -1449,10 +1451,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         temporal_conv_size = hf_config.temporal_conv_size
         llm_pos_ids_list: list = []
 
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
+        if image_grid_thw or video_grid_thw:
             input_token_type: list[str] = []
             video_check_flg = False
             for token in input_tokens:
@@ -1484,11 +1483,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
                         llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                     )
                     if modality_type == "image":
-                        t, h, w = (
-                            image_grid_thw[mm_data_idx][0],
-                            image_grid_thw[mm_data_idx][1],
-                            image_grid_thw[mm_data_idx][2],
-                        )
+                        t, h, w = image_grid_thw[mm_data_idx]
                         llm_grid_t, llm_grid_h, llm_grid_w = (
                             t,
                             h // spatial_conv_size,
@@ -1519,11 +1514,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
                         mm_data_idx += 1
 
                     elif modality_type == "video":
-                        t, h, w = (
-                            video_grid_thw[mm_data_idx][0],
-                            video_grid_thw[mm_data_idx][1],
-                            video_grid_thw[mm_data_idx][2],
-                        )
+                        t, h, w = video_grid_thw[mm_data_idx]
                         llm_grid_t, llm_grid_h, llm_grid_w = (
                             t // temporal_conv_size,
                             h // spatial_conv_size,
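Note on the guard change above: the grids now arrive as plain Python lists built from `mm_features`, so a missing modality is an empty list rather than `None`, and the old `isinstance(..., torch.Tensor)` normalization becomes dead code. A tiny illustration (made-up values) of why the truthiness test replaces the `None` checks:

    image_grid_thw = []              # request carries no images
    video_grid_thw = [[8, 24, 24]]   # one video feature, [t, h, w]
    # Empty lists are falsy, so this reads like the old None check while
    # also skipping the branch when both modalities are absent:
    if image_grid_thw or video_grid_thw:
        pass  # compute multimodal M-RoPE positions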
@@ -37,7 +37,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig
 from transformers.models.glm4v.image_processing_glm4v import (
     Glm4vImageProcessor,
@@ -70,6 +70,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     VideoItem,
@@ -1619,25 +1620,23 @@ class Glm4vForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: "PretrainedConfig",
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for GLM4V."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_start_token_id = hf_config.video_start_token_id
         video_end_token_id = hf_config.video_end_token_id
         spatial_merge_size = hf_config.vision_config.spatial_merge_size
         llm_pos_ids_list: list = []
 
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
+        if image_grid_thw or video_grid_thw:
             input_token_type: list[str] = []
             video_check_flg = False
             for token in input_tokens:
@@ -1669,11 +1668,7 @@ class Glm4vForConditionalGeneration(
                         llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                     )
                     if modality_type == "image":
-                        t, h, w = (
-                            image_grid_thw[mm_data_idx][0],
-                            image_grid_thw[mm_data_idx][1],
-                            image_grid_thw[mm_data_idx][2],
-                        )
+                        t, h, w = image_grid_thw[mm_data_idx]
                         llm_grid_t, llm_grid_h, llm_grid_w = (
                             t,
                             h // spatial_merge_size,
@@ -1706,8 +1701,7 @@ class Glm4vForConditionalGeneration(
                     elif modality_type == "video":
                         t, h, w = (
                             video_frame_num,
-                            image_grid_thw[mm_data_idx][1],
-                            image_grid_thw[mm_data_idx][2],
+                            *image_grid_thw[mm_data_idx][1:],
                         )
                         llm_grid_t, llm_grid_h, llm_grid_w = (
                             t,
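The video branch above keeps `video_frame_num` as the temporal size and now splats the spatial dims with `*grid[1:]` instead of indexing `[1]` and `[2]` separately. Both forms are equivalent for a length-3 row (illustrative values):

    video_frame_num = 16
    row = [4, 32, 32]                        # [t, h, w] from the processor
    t, h, w = (video_frame_num, *row[1:])    # new form
    # same result as (video_frame_num, row[1], row[2])
    assert (t, h, w) == (16, 32, 32)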
@@ -15,7 +15,7 @@ from torch import nn
 from torch.nn import LayerNorm
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import BatchFeature, PretrainedConfig, PreTrainedTokenizer, TensorType
+from transformers import BatchFeature, PreTrainedTokenizer, TensorType
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
 
@@ -36,6 +36,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
@@ -622,25 +623,23 @@ class GLM4VForCausalLM(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for GLM4V."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_start_token_id = hf_config.video_start_token_id
         video_end_token_id = hf_config.video_end_token_id
         spatial_merge_size = hf_config.vision_config.spatial_merge_size
         llm_pos_ids_list: list = []
 
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
+        if image_grid_thw or video_grid_thw:
             input_token_type: list[str] = []
             video_check_flg = False
             for token in input_tokens:
@@ -672,11 +671,7 @@ class GLM4VForCausalLM(
                         llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                     )
                     if modality_type == "image":
-                        t, h, w = (
-                            image_grid_thw[mm_data_idx][0],
-                            image_grid_thw[mm_data_idx][1],
-                            image_grid_thw[mm_data_idx][2],
-                        )
+                        t, h, w = image_grid_thw[mm_data_idx]
                         llm_grid_t, llm_grid_h, llm_grid_w = (
                             t,
                             h // spatial_merge_size,
@@ -709,8 +704,7 @@ class GLM4VForCausalLM(
                     elif modality_type == "video":
                         t, h, w = (
                             video_frame_num,
-                            image_grid_thw[mm_data_idx][1],
-                            image_grid_thw[mm_data_idx][2],
+                            *image_grid_thw[mm_data_idx][1:],
                         )
                         llm_grid_t, llm_grid_h, llm_grid_w = (
                             t,
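As in the other models, `t, h, w = image_grid_thw[mm_data_idx]` relies on each grid row being exactly three elements; sequence unpacking raises `ValueError` on malformed rows instead of silently mis-indexing:

    t, h, w = [1, 64, 64]        # fine: one [t, h, w] row
    try:
        t, h, w = [1, 64]        # malformed row
    except ValueError:
        pass                     # fails loudly rather than continuing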
@@ -16,7 +16,6 @@ import numpy as np
 import torch
 import torch.nn as nn
 from torch import Tensor
-from transformers import PretrainedConfig
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from typing_extensions import Self, TypeIs
 
@@ -32,10 +31,12 @@ from .interfaces_base import VllmModel, is_pooling_model
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.model_executor.models.utils import WeightsMapper
+    from vllm.multimodal.inputs import MultiModalFeatureSpec
     from vllm.sequence import IntermediateTensors
 else:
     VllmConfig = object
     WeightsMapper = object
+    MultiModalFeatureSpec = object
     IntermediateTensors = object
 
 logger = init_logger(__name__)
@@ -991,12 +992,7 @@ class SupportsMRoPE(Protocol):
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list["MultiModalFeatureSpec"],
     ) -> tuple[torch.Tensor, int]:
         """
         Get M-RoPE input positions and delta value for this specific model.
@@ -1006,17 +1002,11 @@ class SupportsMRoPE(Protocol):
 
         Args:
             input_tokens: List of input token IDs
-            hf_config: HuggingFace model configuration
-            image_grid_thw: Image grid dimensions (t, h, w)
-            video_grid_thw: Video grid dimensions (t, h, w)
-            second_per_grid_ts: Seconds per grid timestep for videos
-            audio_feature_lengths: Audio feature lengths for multimodal models
-            use_audio_in_video: Whether to use audio in video for interleaving
+            mm_features: Information about each multi-modal data item
 
         Returns:
-            Tuple of (llm_positions, mrope_position_delta)
-            - llm_positions: Tensor of shape [3, num_tokens]
-              with T/H/W positions
+            Tuple of `(llm_positions, mrope_position_delta)`
+            - llm_positions: Tensor of shape `[3, num_tokens]` with T/H/W positions
             - mrope_position_delta: Delta for position calculations
         """
         ...
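For model authors, a minimal conforming implementation of the updated `SupportsMRoPE` surface might look like the sketch below. This is a hypothetical, text-only placeholder for illustration; real models derive distinct T/H/W positions from the gathered grids:

    import torch

    from vllm.multimodal.inputs import MultiModalFeatureSpec


    class MyMRoPEModel:
        """Hypothetical model satisfying the new protocol signature."""

        def get_mrope_input_positions(
            self,
            input_tokens: list[int],
            mm_features: list[MultiModalFeatureSpec],
        ) -> tuple[torch.Tensor, int]:
            # Placeholder: identical 0..n-1 positions on the T, H, and W
            # rows, and no position delta.
            n = len(input_tokens)
            llm_positions = torch.arange(n).unsqueeze(0).expand(3, -1)
            return llm_positions, 0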
@@ -40,6 +40,7 @@ from vllm.multimodal.inputs import (
     ImageItem,
     ModalityData,
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     VideoItem,
@@ -1627,16 +1628,17 @@ class KeyeForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
         if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
             video_grid_thw = video_grid_thw[0]
-        """Get mrope input positions and delta value (Keye series)."""
 
         def split_thw(grid_thw: torch.Tensor | list[int]) -> list[list[int]]:
             """
@@ -1662,6 +1664,7 @@ class KeyeForConditionalGeneration(
 
         video_grid_thw = split_thw(video_grid_thw)
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         spatial_merge_size = hf_config.vision_config.spatial_merge_size
@@ -1691,20 +1694,12 @@ class KeyeForConditionalGeneration(
                     ed_video = len(input_tokens) + 1
 
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_index += 1
                     remain_frames -= 1
                     ed = ed_video
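In the Keye path, `gather_kwargs` returns one grid entry per video feature, and the pre-existing `video_grid_thw[0]` unwrapping plus `split_thw` then expands that entry into per-frame rows. Roughly (the shapes here are illustrative assumptions, not taken from the diff):

    # One video feature whose grid packs two frames:
    video_grid_thw = [[[1, 32, 32], [1, 32, 32]]]
    if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
        video_grid_thw = video_grid_thw[0]   # -> [[1, 32, 32], [1, 32, 32]]
    # split_thw(...) would then yield one [t, h, w] row per frame.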
@@ -21,6 +21,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     ImageItem,
     ModalityData,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     VideoItem,
@@ -597,16 +598,17 @@ class KeyeVL1_5ForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
         if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
             video_grid_thw = video_grid_thw[0]
-        """Get mrope input positions and delta value (Keye series)."""
 
         def split_thw(grid_thw: torch.Tensor | list[int]) -> list[list[int]]:
             """
@@ -632,6 +634,7 @@ class KeyeVL1_5ForConditionalGeneration(
 
         video_grid_thw = split_thw(video_grid_thw)
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         spatial_merge_size = hf_config.vision_config.spatial_merge_size
@@ -661,20 +664,12 @@ class KeyeVL1_5ForConditionalGeneration(
                     ed_video = len(input_tokens) + 1
 
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_index += 1
                     remain_frames -= 1
                     ed = ed_video
@@ -61,6 +61,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargs,
 )
@@ -1184,15 +1185,17 @@ class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, Support
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float],
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw", "second_per_grid_ts"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         vision_start_token_id = hf_config.vision_start_token_id
@@ -1229,20 +1232,12 @@ class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, Support
                 else:
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_second_per_grid_t = 1.0
                     if second_per_grid_ts:
                         video_second_per_grid_t = second_per_grid_ts[video_index]
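PaddleOCR-VL additionally gathers `second_per_grid_ts`; when the processor did not attach it, each video falls back to 1.0 seconds per temporal grid step, as the retained context lines show:

    second_per_grid_ts: list[float] = []    # nothing gathered for this request
    video_index = 0
    video_second_per_grid_t = 1.0           # default temporal scale
    if second_per_grid_ts:
        video_second_per_grid_t = second_per_grid_ts[video_index]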
@@ -68,6 +68,7 @@ from vllm.multimodal.inputs import (
     ImageItem,
     ModalityData,
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     NestedTensors,
@@ -923,21 +924,9 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value (Qwen2.5-Omni version).
-
-        Differences from MRotaryEmbedding:
-        1. Add audio support (and related `audio_feature_lengths`).
-        2. Add `use_audio_in_video` option to read audio from video inputs.
-           In this case, audio and vision position ids will be split into
-           chunks and interleaved.
-
+        """
         Example:
 
             (V_i are vision position ids, A_i are audio position ids)
@@ -945,11 +934,33 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             |V_1 ... V_n|A_1 ... A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|...
             |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |...
         """
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+        audio_feature_lengths = kwargs.get("audio_feature_lengths", [])
+        use_audio_in_video = any(kwargs.get("use_audio_in_video", []))
+
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
+            image_grid_thw
+        )
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(
+            video_grid_thw
+        )
 
         # TODO(fyabc): refactor and share more code with
         # _vl_get_input_positions_tensor.
 
-        thinker_config = hf_config.thinker_config
+        thinker_config = self.config
         audio_token_id = thinker_config.audio_token_index
         image_token_id = thinker_config.image_token_index
         video_token_id = thinker_config.video_token_index
@@ -963,11 +974,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             thinker_config.vision_config, "tokens_per_second", 25
         )
 
-        if isinstance(image_grid_thw, list):
-            image_grid_thw = torch.tensor(image_grid_thw)
-        if isinstance(video_grid_thw, list):
-            video_grid_thw = torch.tensor(video_grid_thw)
-
         src_item = input_tokens
         audio_seqlens = audio_feature_lengths
         if not second_per_grid_ts:
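The `(torch.stack if ... else torch.tensor)(...)` one-liner in the Omni hunk handles the empty case: `gather_kwargs` yields a list of per-feature tensors, `torch.stack` turns a non-empty list into an `(N, 3)` grid tensor, but it rejects empty lists, whereas `torch.tensor([])` produces the empty tensor the downstream code expects. Similarly, `use_audio_in_video` is reduced with `any()` over the per-feature flags:

    import torch

    grids = [torch.tensor([1, 32, 32]), torch.tensor([8, 24, 24])]
    stacked = (torch.stack if grids else torch.tensor)(grids)
    assert stacked.shape == (2, 3)

    empty = (torch.stack if [] else torch.tensor)([])
    assert empty.numel() == 0      # torch.stack([]) would raise instead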
@@ -35,7 +35,7 @@ import einops
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
 from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
     Qwen2_5_VLConfig,
@@ -75,7 +75,11 @@ from vllm.multimodal.evs import (
     compute_retention_mask,
     recompute_mrope_positions,
 )
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargs,
+)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate
 from vllm.sequence import IntermediateTensors
@@ -1120,15 +1124,17 @@ class Qwen2_5_VLForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float],
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
    ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw", "second_per_grid_ts"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         vision_start_token_id = hf_config.vision_start_token_id
@@ -1165,20 +1171,12 @@ class Qwen2_5_VLForConditionalGeneration(
                 else:
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_second_per_grid_t = 1.0
                     if second_per_grid_ts:
                         video_second_per_grid_t = second_per_grid_ts[video_index]
@@ -34,7 +34,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 from transformers.models.qwen2_vl import Qwen2VLImageProcessor, Qwen2VLProcessor
 from transformers.models.qwen2_vl.configuration_qwen2_vl import (
     Qwen2VLConfig,
@@ -70,6 +70,7 @@ from vllm.multimodal.inputs import (
     ImageItem,
     ModalityData,
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     VideoItem,
@@ -1240,21 +1241,17 @@ class Qwen2VLForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get M-RoPE input positions for Qwen2-VL model."""
-        if image_grid_thw is None:
-            image_grid_thw = []
-        if video_grid_thw is None:
-            video_grid_thw = []
-        if second_per_grid_ts is None:
-            second_per_grid_ts = []
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw", "second_per_grid_ts"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         vision_start_token_id = hf_config.vision_start_token_id
@@ -1291,20 +1288,12 @@ class Qwen2VLForConditionalGeneration(
                 else:
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_second_per_grid_t = 1.0
                     if second_per_grid_ts:
                         video_second_per_grid_t = second_per_grid_ts[video_index]
@@ -65,7 +65,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.qwen2_audio import Qwen2AudioProcessingInfo
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargsItems
+from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
 from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems
 from vllm.multimodal.processing import (
     BaseMultiModalProcessor,
@@ -1414,39 +1414,48 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        config = hf_config.thinker_config
-        if isinstance(image_grid_thw, list):
-            image_grid_thw = torch.tensor(image_grid_thw)
-        if isinstance(video_grid_thw, list):
-            video_grid_thw = torch.tensor(video_grid_thw)
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+        audio_feature_lengths = kwargs.get("audio_feature_lengths", [])
+        use_audio_in_video = any(kwargs.get("use_audio_in_video", []))
+
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
+            image_grid_thw
+        )
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(
+            video_grid_thw
+        )
+
         input_ids = torch.tensor(input_tokens)
         if input_ids is None or input_ids.ndim != 1:
             raise ValueError("_omni3_get_input_positions_tensor expects 1D input_ids")
 
         seq_len = input_ids.shape[0]
-        if audio_feature_lengths is not None and not isinstance(
-            audio_feature_lengths, torch.Tensor
-        ):
-            audio_feature_lengths = torch.as_tensor(
+        if isinstance(audio_feature_lengths, list):
+            audio_feature_lengths = torch.tensor(
                 audio_feature_lengths, dtype=torch.long
             )
-        if second_per_grid_ts is None:
-            if video_grid_thw is not None and video_grid_thw.numel() > 0:
-                second_per_grids = torch.ones(
-                    video_grid_thw.shape[0], dtype=torch.float32
-                )
-            else:
-                second_per_grids = torch.tensor([], dtype=torch.float32)
+        if not len(second_per_grid_ts) and len(video_grid_thw):
+            second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32)
         else:
             second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)
 
+        config = self.config
         spatial_merge_size = config.vision_config.spatial_merge_size
         image_token_id = config.image_token_id
         video_token_id = config.video_token_id
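Because `audio_feature_lengths` now arrives as the gathered list (possibly empty) rather than an optional tensor, the Qwen3-Omni hunk only needs the list-to-tensor branch:

    import torch

    audio_feature_lengths = [1500, 750]      # one length per audio feature
    if isinstance(audio_feature_lengths, list):
        audio_feature_lengths = torch.tensor(
            audio_feature_lengths, dtype=torch.long
        )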
@@ -34,7 +34,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
     smart_resize as image_smart_resize,
@@ -70,6 +70,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItem,
     MultiModalKwargsItems,
@@ -1416,17 +1417,18 @@ class Qwen3VLForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
 
         video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)]
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         vision_start_token_id = hf_config.vision_start_token_id
@@ -1455,20 +1457,12 @@ class Qwen3VLForConditionalGeneration(
                 else:
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_index += 1
                     remain_videos -= 1
                     ed = ed_video
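The retained context line in the Qwen3-VL hunk flattens each video grid into per-frame single-step grids before position assignment; each `[t, h, w]` entry becomes `t` copies of `[1, h, w]`:

    video_grid_thw = [[4, 24, 24]]           # one video, four temporal steps
    video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)]
    assert video_grid_thw == [[1, 24, 24]] * 4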
@@ -27,6 +27,7 @@ from vllm.model_executor.models.utils import WeightsMapper
 from vllm.multimodal import MultiModalKwargsItems
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalUUIDDict,
@@ -38,7 +39,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 
 if TYPE_CHECKING:
-    from transformers import BatchFeature, PretrainedConfig
+    from transformers import BatchFeature
 
     from vllm.config import VllmConfig
     from vllm.config.multimodal import BaseDummyOptions
@@ -367,20 +368,34 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: "PretrainedConfig",
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        if any((second_per_grid_ts, audio_feature_lengths, use_audio_in_video)):
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        if any(
+            v
+            for k, v in kwargs.items()
+            if k not in {"image_grid_thw", "video_grid_thw"}
+        ):
             raise NotImplementedError("Transformers backend only supports images.")
 
-        if isinstance(image_grid_thw, list):
-            image_grid_thw = torch.tensor(image_grid_thw)
-        if isinstance(video_grid_thw, list):
-            video_grid_thw = torch.tensor(video_grid_thw)
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
+            image_grid_thw
+        )
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(
+            video_grid_thw
+        )
 
         mrope_positions, mrope_position_delta = self.model.get_rope_index(
             input_ids=torch.tensor(input_tokens).unsqueeze(0),
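The rewritten guard in the Transformers-backend mixin rejects a request as soon as any gathered field other than the two vision grids is non-empty, which generalizes the old positional-argument check. A small sketch of the predicate with made-up values:

    kwargs = {
        "image_grid_thw": [[1, 16, 16]],
        "second_per_grid_ts": [0.5],     # audio/video timing -> unsupported
    }
    unsupported = any(
        v for k, v in kwargs.items()
        if k not in {"image_grid_thw", "video_grid_thw"}
    )
    assert unsupported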
@@ -249,6 +249,19 @@ class MultiModalFeatureSpec:
     mm_position: PlaceholderRange
     """e.g., PlaceholderRange(offset=2, length=336)"""
 
+    @staticmethod
+    def gather_kwargs(features: list["MultiModalFeatureSpec"], keys: set[str]):
+        kwargs = defaultdict[str, list[NestedTensors]](list)
+
+        for f in features:
+            item = f.data
+            if item is not None:
+                for k in keys:
+                    if k in item:
+                        kwargs[k].append(item[k].data)
+
+        return dict(kwargs)
+
 
 @dataclass
 class MultiModalFieldElem:
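The new `gather_kwargs` helper above walks a request's feature specs, skips features whose `data` is `None`, and collects the requested field values into per-key lists (one entry per feature that carries the key). A sketch of how callers use it, mirroring the model diffs (here `mm_features` stands for the request's spec list):

    kwargs = MultiModalFeatureSpec.gather_kwargs(
        mm_features,
        {"image_grid_thw", "video_grid_thw"},
    )
    # Keys absent from every feature are simply missing, hence the .get():
    image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
    video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]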
@@ -892,38 +892,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.input_batch.num_accepted_tokens_cpu[i] = num_tokens
 
     def _init_mrope_positions(self, req_state: CachedRequestState):
-        image_grid_thw = []
-        video_grid_thw = []
-        second_per_grid_ts = []
-        audio_feature_lengths = []
-        use_audio_in_video = False
-        for mm_feature in req_state.mm_features:
-            mm_item = mm_feature.data
-            if mm_item is None:
-                continue
-            mm_input = mm_item.get_data()
-            if (t := mm_input.get("image_grid_thw")) is not None:
-                image_grid_thw.append(t.tolist())
-            if (t := mm_input.get("video_grid_thw")) is not None:
-                video_grid_thw.append(t.tolist())
-            if (t := mm_input.get("second_per_grid_ts")) is not None:
-                second_per_grid_ts.append(t)
-            if (t := mm_input.get("audio_feature_lengths")) is not None:
-                audio_feature_lengths.append(t)
-            if mm_input.get("use_audio_in_video") is True:
-                use_audio_in_video = True
-
-        assert supports_mrope(self.get_model()), "M-RoPE support is not implemented."
+        model = self.get_model()
+        assert supports_mrope(model), "M-RoPE support is not implemented."
 
         req_state.mrope_positions, req_state.mrope_position_delta = (
-            self.model.get_mrope_input_positions(
+            model.get_mrope_input_positions(
                 req_state.prompt_token_ids,
-                hf_config=self.model_config.hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
+                req_state.mm_features,
             )
         )