[Model] Pass mm_features directly into get_mrope_input_positions (#28399)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung 2025-11-11 21:14:48 +08:00 committed by GitHub
parent 7dbe6d81d6
commit afffd3cc8a
15 changed files with 225 additions and 272 deletions

View File

@@ -34,7 +34,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.attention.layer import (
@@ -58,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
@@ -1433,15 +1434,16 @@ class Ernie4_5_VLMoeForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for Ernie VL."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
+        hf_config = self.config
         image_token_id = hf_config.im_patch_id
         video_start_token_id = hf_config.video_start_token_id
         video_end_token_id = hf_config.video_end_token_id
@@ -1449,10 +1451,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         temporal_conv_size = hf_config.temporal_conv_size
 
         llm_pos_ids_list: list = []
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
+        if image_grid_thw or video_grid_thw:
             input_token_type: list[str] = []
             video_check_flg = False
             for token in input_tokens:
@@ -1484,11 +1483,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
                     llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                 )
                 if modality_type == "image":
-                    t, h, w = (
-                        image_grid_thw[mm_data_idx][0],
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
-                    )
+                    t, h, w = image_grid_thw[mm_data_idx]
                     llm_grid_t, llm_grid_h, llm_grid_w = (
                         t,
                         h // spatial_conv_size,
@@ -1519,11 +1514,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(
                     mm_data_idx += 1
 
                 elif modality_type == "video":
-                    t, h, w = (
-                        video_grid_thw[mm_data_idx][0],
-                        video_grid_thw[mm_data_idx][1],
-                        video_grid_thw[mm_data_idx][2],
-                    )
+                    t, h, w = video_grid_thw[mm_data_idx]
                     llm_grid_t, llm_grid_h, llm_grid_w = (
                         t // temporal_conv_size,
                         h // spatial_conv_size,

View File

@@ -37,7 +37,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig
 from transformers.models.glm4v.image_processing_glm4v import (
     Glm4vImageProcessor,
@@ -70,6 +70,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     VideoItem,
@@ -1619,25 +1620,23 @@ class Glm4vForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: "PretrainedConfig",
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for GLM4V."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_start_token_id = hf_config.video_start_token_id
         video_end_token_id = hf_config.video_end_token_id
         spatial_merge_size = hf_config.vision_config.spatial_merge_size
 
         llm_pos_ids_list: list = []
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
+        if image_grid_thw or video_grid_thw:
             input_token_type: list[str] = []
             video_check_flg = False
             for token in input_tokens:
@@ -1669,11 +1668,7 @@ class Glm4vForConditionalGeneration(
                     llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                 )
                 if modality_type == "image":
-                    t, h, w = (
-                        image_grid_thw[mm_data_idx][0],
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
-                    )
+                    t, h, w = image_grid_thw[mm_data_idx]
                     llm_grid_t, llm_grid_h, llm_grid_w = (
                         t,
                         h // spatial_merge_size,
@@ -1706,8 +1701,7 @@ class Glm4vForConditionalGeneration(
                 elif modality_type == "video":
                     t, h, w = (
                         video_frame_num,
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
+                        *image_grid_thw[mm_data_idx][1:],
                     )
                     llm_grid_t, llm_grid_h, llm_grid_w = (
                         t,

View File

@@ -15,7 +15,7 @@ from torch import nn
 from torch.nn import LayerNorm
 from torchvision import transforms
 from torchvision.transforms import InterpolationMode
-from transformers import BatchFeature, PretrainedConfig, PreTrainedTokenizer, TensorType
+from transformers import BatchFeature, PreTrainedTokenizer, TensorType
 from transformers.image_utils import ImageInput
 from transformers.tokenization_utils_base import TextInput
@@ -36,6 +36,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
@@ -622,25 +623,23 @@ class GLM4VForCausalLM(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value for GLM4V."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+
+        hf_config = self.config
        image_token_id = hf_config.image_token_id
         video_start_token_id = hf_config.video_start_token_id
         video_end_token_id = hf_config.video_end_token_id
         spatial_merge_size = hf_config.vision_config.spatial_merge_size
 
         llm_pos_ids_list: list = []
-        if not (image_grid_thw is None and video_grid_thw is None):
-            if isinstance(image_grid_thw, torch.Tensor):
-                image_grid_thw = image_grid_thw.tolist()
-
+        if image_grid_thw or video_grid_thw:
             input_token_type: list[str] = []
             video_check_flg = False
             for token in input_tokens:
@@ -672,11 +671,7 @@ class GLM4VForCausalLM(
                     llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
                 )
                 if modality_type == "image":
-                    t, h, w = (
-                        image_grid_thw[mm_data_idx][0],
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
-                    )
+                    t, h, w = image_grid_thw[mm_data_idx]
                     llm_grid_t, llm_grid_h, llm_grid_w = (
                         t,
                         h // spatial_merge_size,
@@ -709,8 +704,7 @@ class GLM4VForCausalLM(
                 elif modality_type == "video":
                     t, h, w = (
                         video_frame_num,
-                        image_grid_thw[mm_data_idx][1],
-                        image_grid_thw[mm_data_idx][2],
+                        *image_grid_thw[mm_data_idx][1:],
                     )
                     llm_grid_t, llm_grid_h, llm_grid_w = (
                         t,

View File

@@ -16,7 +16,6 @@ import numpy as np
 import torch
 import torch.nn as nn
 from torch import Tensor
-from transformers import PretrainedConfig
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from typing_extensions import Self, TypeIs
@@ -32,10 +31,12 @@ from .interfaces_base import VllmModel, is_pooling_model
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
     from vllm.model_executor.models.utils import WeightsMapper
+    from vllm.multimodal.inputs import MultiModalFeatureSpec
     from vllm.sequence import IntermediateTensors
 else:
     VllmConfig = object
     WeightsMapper = object
+    MultiModalFeatureSpec = object
     IntermediateTensors = object
 
 logger = init_logger(__name__)
@@ -991,12 +992,7 @@ class SupportsMRoPE(Protocol):
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list["MultiModalFeatureSpec"],
     ) -> tuple[torch.Tensor, int]:
         """
         Get M-RoPE input positions and delta value for this specific model.
@@ -1006,17 +1002,11 @@ class SupportsMRoPE(Protocol):
 
         Args:
             input_tokens: List of input token IDs
-            hf_config: HuggingFace model configuration
-            image_grid_thw: Image grid dimensions (t, h, w)
-            video_grid_thw: Video grid dimensions (t, h, w)
-            second_per_grid_ts: Seconds per grid timestep for videos
-            audio_feature_lengths: Audio feature lengths for multimodal models
-            use_audio_in_video: Whether to use audio in video for interleaving
+            mm_features: Information about each multi-modal data item
 
         Returns:
-            Tuple of (llm_positions, mrope_position_delta)
-                - llm_positions: Tensor of shape [3, num_tokens]
-                    with T/H/W positions
+            Tuple of `(llm_positions, mrope_position_delta)`
+                - llm_positions: Tensor of shape `[3, num_tokens]` with T/H/W positions
                 - mrope_position_delta: Delta for position calculations
         """
         ...
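
As a reading aid for the new protocol, here is a rough, self-contained sketch of an implementer. SketchVLModel is hypothetical and its return values are placeholders; only the gather/unpack pattern mirrors the per-model hunks in this commit.

import torch

from vllm.multimodal.inputs import MultiModalFeatureSpec


class SketchVLModel:  # hypothetical implementer of SupportsMRoPE
    def __init__(self, config):
        self.config = config  # the HF config now comes from the model itself

    def get_mrope_input_positions(
        self,
        input_tokens: list[int],
        mm_features: list[MultiModalFeatureSpec],
    ) -> tuple[torch.Tensor, int]:
        # Grid metadata that previously arrived as separate keyword arguments
        # is now pulled out of the feature specs by the model itself.
        kwargs = MultiModalFeatureSpec.gather_kwargs(
            mm_features, {"image_grid_thw", "video_grid_thw"}
        )
        image_grid_thw = [t.tolist() for t in kwargs.get("image_grid_thw", [])]
        video_grid_thw = [t.tolist() for t in kwargs.get("video_grid_thw", [])]
        hf_config = self.config  # replaces the removed hf_config parameter

        # Placeholder values only; a real model computes T/H/W positions
        # from the grids, as the per-model hunks in this commit do.
        llm_positions = torch.zeros(3, len(input_tokens), dtype=torch.long)
        mrope_position_delta = 0
        return llm_positions, mrope_position_delta

On the caller side, the model runner hunk at the end of this diff forwards req_state.mm_features as-is instead of pre-extracting these grids.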

View File

@@ -40,6 +40,7 @@ from vllm.multimodal.inputs import (
     ImageItem,
     ModalityData,
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     VideoItem,
@@ -1627,16 +1628,17 @@ class KeyeForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
         if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
             video_grid_thw = video_grid_thw[0]
-        """Get mrope input positions and delta value (Keye series)."""
 
         def split_thw(grid_thw: torch.Tensor | list[int]) -> list[list[int]]:
             """
@@ -1662,6 +1664,7 @@ class KeyeForConditionalGeneration(
 
         video_grid_thw = split_thw(video_grid_thw)
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         spatial_merge_size = hf_config.vision_config.spatial_merge_size
@@ -1691,20 +1694,12 @@ class KeyeForConditionalGeneration(
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_index += 1
                     remain_frames -= 1
                     ed = ed_video

View File

@@ -21,6 +21,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     ImageItem,
     ModalityData,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     VideoItem,
@@ -597,16 +598,17 @@ class KeyeVL1_5ForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
         if isinstance(video_grid_thw, list) and len(video_grid_thw) > 0:
             video_grid_thw = video_grid_thw[0]
-        """Get mrope input positions and delta value (Keye series)."""
 
         def split_thw(grid_thw: torch.Tensor | list[int]) -> list[list[int]]:
             """
@@ -632,6 +634,7 @@ class KeyeVL1_5ForConditionalGeneration(
 
         video_grid_thw = split_thw(video_grid_thw)
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         spatial_merge_size = hf_config.vision_config.spatial_merge_size
@@ -661,20 +664,12 @@ class KeyeVL1_5ForConditionalGeneration(
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_index += 1
                     remain_frames -= 1
                     ed = ed_video

View File

@@ -61,6 +61,7 @@ from vllm.model_executor.model_loader.weight_utils import (
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargs,
 )
@@ -1184,15 +1185,17 @@ class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, Support
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float],
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw", "second_per_grid_ts"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         vision_start_token_id = hf_config.vision_start_token_id
@@ -1229,20 +1232,12 @@ class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, Support
                 else:
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_second_per_grid_t = 1.0
                     if second_per_grid_ts:
                         video_second_per_grid_t = second_per_grid_ts[video_index]

View File

@@ -68,6 +68,7 @@ from vllm.multimodal.inputs import (
     ImageItem,
     ModalityData,
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     NestedTensors,
@@ -923,21 +924,9 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value (Qwen2.5-Omni version).
-
-        Differences from MRotaryEmbedding:
-            1. Add audio support (and related `audio_feature_lengths`).
-            2. Add `use_audio_in_video` option to read audio from video inputs.
-                In this case, audio and vision position ids will be split into
-                chunks and interleaved.
+        """
 
         Example:
             (V_i are vision position ids, A_i are audio position ids)
@@ -945,11 +934,33 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             |V_1 ... V_n|A_1 ... A_n|V_n+1 ... V_2n|A_n+1 ... A_2n|...
             |vision chunk 1|audio chunk 1|vision chunk 2|audio chunk 2 |...
         """
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+        audio_feature_lengths = kwargs.get("audio_feature_lengths", [])
+        use_audio_in_video = any(kwargs.get("use_audio_in_video", []))
+
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
+            image_grid_thw
+        )
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(
+            video_grid_thw
+        )
 
         # TODO(fyabc): refactor and share more code with
         # _vl_get_input_positions_tensor.
 
-        thinker_config = hf_config.thinker_config
+        thinker_config = self.config
         audio_token_id = thinker_config.audio_token_index
         image_token_id = thinker_config.image_token_index
         video_token_id = thinker_config.video_token_index
@@ -963,11 +974,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
             thinker_config.vision_config, "tokens_per_second", 25
         )
 
-        if isinstance(image_grid_thw, list):
-            image_grid_thw = torch.tensor(image_grid_thw)
-        if isinstance(video_grid_thw, list):
-            video_grid_thw = torch.tensor(video_grid_thw)
-
         src_item = input_tokens
         audio_seqlens = audio_feature_lengths
         if not second_per_grid_ts:

View File

@@ -35,7 +35,7 @@ import einops
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
 from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
     Qwen2_5_VLConfig,
@@ -75,7 +75,11 @@ from vllm.multimodal.evs import (
     compute_retention_mask,
     recompute_mrope_positions,
 )
-from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs
+from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
+    MultiModalFieldConfig,
+    MultiModalKwargs,
+)
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate
 from vllm.sequence import IntermediateTensors
@@ -1120,15 +1124,17 @@ class Qwen2_5_VLForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float],
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw", "second_per_grid_ts"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         vision_start_token_id = hf_config.vision_start_token_id
@@ -1165,20 +1171,12 @@ class Qwen2_5_VLForConditionalGeneration(
                 else:
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_second_per_grid_t = 1.0
                     if second_per_grid_ts:
                         video_second_per_grid_t = second_per_grid_ts[video_index]

View File

@@ -34,7 +34,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 from transformers.models.qwen2_vl import Qwen2VLImageProcessor, Qwen2VLProcessor
 from transformers.models.qwen2_vl.configuration_qwen2_vl import (
     Qwen2VLConfig,
@@ -70,6 +70,7 @@ from vllm.multimodal.inputs import (
     ImageItem,
     ModalityData,
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItems,
     VideoItem,
@@ -1240,21 +1241,17 @@ class Qwen2VLForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get M-RoPE input positions for Qwen2-VL model."""
-        if image_grid_thw is None:
-            image_grid_thw = []
-        if video_grid_thw is None:
-            video_grid_thw = []
-        if second_per_grid_ts is None:
-            second_per_grid_ts = []
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw", "second_per_grid_ts"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         vision_start_token_id = hf_config.vision_start_token_id
@@ -1291,20 +1288,12 @@ class Qwen2VLForConditionalGeneration(
                 else:
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_second_per_grid_t = 1.0
                     if second_per_grid_ts:
                         video_second_per_grid_t = second_per_grid_ts[video_index]

View File

@@ -65,7 +65,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.qwen2_audio import Qwen2AudioProcessingInfo
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargsItems
+from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
 from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems
 from vllm.multimodal.processing import (
     BaseMultiModalProcessor,
@@ -1414,39 +1414,48 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        config = hf_config.thinker_config
-        if isinstance(image_grid_thw, list):
-            image_grid_thw = torch.tensor(image_grid_thw)
-        if isinstance(video_grid_thw, list):
-            video_grid_thw = torch.tensor(video_grid_thw)
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+        second_per_grid_ts = kwargs.get("second_per_grid_ts", [])
+        audio_feature_lengths = kwargs.get("audio_feature_lengths", [])
+        use_audio_in_video = any(kwargs.get("use_audio_in_video", []))
+
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
+            image_grid_thw
+        )
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(
+            video_grid_thw
+        )
 
         input_ids = torch.tensor(input_tokens)
         if input_ids is None or input_ids.ndim != 1:
             raise ValueError("_omni3_get_input_positions_tensor expects 1D input_ids")
         seq_len = input_ids.shape[0]
-        if audio_feature_lengths is not None and not isinstance(
-            audio_feature_lengths, torch.Tensor
-        ):
-            audio_feature_lengths = torch.as_tensor(
+
+        if isinstance(audio_feature_lengths, list):
+            audio_feature_lengths = torch.tensor(
                 audio_feature_lengths, dtype=torch.long
             )
-        if second_per_grid_ts is None:
-            if video_grid_thw is not None and video_grid_thw.numel() > 0:
-                second_per_grids = torch.ones(
-                    video_grid_thw.shape[0], dtype=torch.float32
-                )
-            else:
-                second_per_grids = torch.tensor([], dtype=torch.float32)
+
+        if not len(second_per_grid_ts) and len(video_grid_thw):
+            second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32)
         else:
             second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)
 
+        config = self.config
         spatial_merge_size = config.vision_config.spatial_merge_size
         image_token_id = config.image_token_id
         video_token_id = config.video_token_id

View File

@@ -34,7 +34,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import BatchFeature, PretrainedConfig
+from transformers import BatchFeature
 from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
     smart_resize as image_smart_resize,
@@ -70,6 +70,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalKwargsItem,
     MultiModalKwargsItems,
@@ -1416,17 +1417,18 @@ class Qwen3VLForConditionalGeneration(
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: PretrainedConfig,
-        image_grid_thw: list[list[int]] | torch.Tensor,
-        video_grid_thw: list[list[int]] | torch.Tensor,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        """Get mrope input positions and delta value."""
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {"image_grid_thw", "video_grid_thw"},
+        )
+        image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])]
+        video_grid_thw = [item.tolist() for item in kwargs.get("video_grid_thw", [])]
         video_grid_thw = [[1, h, w] for t, h, w in video_grid_thw for _ in range(t)]
 
+        hf_config = self.config
         image_token_id = hf_config.image_token_id
         video_token_id = hf_config.video_token_id
         vision_start_token_id = hf_config.vision_start_token_id
@@ -1455,20 +1457,12 @@ class Qwen3VLForConditionalGeneration(
                 else:
                     ed_video = len(input_tokens) + 1
                 if ed_image < ed_video:
-                    t, h, w = (
-                        image_grid_thw[image_index][0],
-                        image_grid_thw[image_index][1],
-                        image_grid_thw[image_index][2],
-                    )
+                    t, h, w = image_grid_thw[image_index]
                     image_index += 1
                     remain_images -= 1
                     ed = ed_image
                 else:
-                    t, h, w = (
-                        video_grid_thw[video_index][0],
-                        video_grid_thw[video_index][1],
-                        video_grid_thw[video_index][2],
-                    )
+                    t, h, w = video_grid_thw[video_index]
                     video_index += 1
                     remain_videos -= 1
                     ed = ed_video

View File

@@ -27,6 +27,7 @@ from vllm.model_executor.models.utils import WeightsMapper
 from vllm.multimodal import MultiModalKwargsItems
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
+    MultiModalFeatureSpec,
     MultiModalFieldConfig,
     MultiModalInputs,
     MultiModalUUIDDict,
@@ -38,7 +39,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 
 if TYPE_CHECKING:
-    from transformers import BatchFeature, PretrainedConfig
+    from transformers import BatchFeature
 
     from vllm.config import VllmConfig
     from vllm.config.multimodal import BaseDummyOptions
@@ -367,20 +368,34 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
     def get_mrope_input_positions(
         self,
         input_tokens: list[int],
-        hf_config: "PretrainedConfig",
-        image_grid_thw: list[list[int]] | torch.Tensor | None,
-        video_grid_thw: list[list[int]] | torch.Tensor | None,
-        second_per_grid_ts: list[float] | None = None,
-        audio_feature_lengths: torch.Tensor | None = None,
-        use_audio_in_video: bool = False,
+        mm_features: list[MultiModalFeatureSpec],
     ) -> tuple[torch.Tensor, int]:
-        if any((second_per_grid_ts, audio_feature_lengths, use_audio_in_video)):
+        kwargs = MultiModalFeatureSpec.gather_kwargs(
+            mm_features,
+            {
+                "image_grid_thw",
+                "video_grid_thw",
+                "second_per_grid_ts",
+                "audio_feature_lengths",
+                "use_audio_in_video",
+            },
+        )
+        if any(
+            v
+            for k, v in kwargs.items()
+            if k not in {"image_grid_thw", "video_grid_thw"}
+        ):
             raise NotImplementedError("Transformers backend only supports images.")
 
-        if isinstance(image_grid_thw, list):
-            image_grid_thw = torch.tensor(image_grid_thw)
-        if isinstance(video_grid_thw, list):
-            video_grid_thw = torch.tensor(video_grid_thw)
+        image_grid_thw = kwargs.get("image_grid_thw", [])
+        video_grid_thw = kwargs.get("video_grid_thw", [])
+
+        image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(
+            image_grid_thw
+        )
+        video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(
+            video_grid_thw
+        )
 
         mrope_positions, mrope_position_delta = self.model.get_rope_index(
             input_ids=torch.tensor(input_tokens).unsqueeze(0),

View File

@@ -249,6 +249,19 @@ class MultiModalFeatureSpec:
     mm_position: PlaceholderRange
     """e.g., PlaceholderRange(offset=2, length=336)"""
 
+    @staticmethod
+    def gather_kwargs(features: list["MultiModalFeatureSpec"], keys: set[str]):
+        kwargs = defaultdict[str, list[NestedTensors]](list)
+
+        for f in features:
+            item = f.data
+            if item is not None:
+                for k in keys:
+                    if k in item:
+                        kwargs[k].append(item[k].data)
+
+        return dict(kwargs)
+
 
 @dataclass
 class MultiModalFieldElem:
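
For reference, a rough illustration of the gathering semantics using stand-in objects. FakeElem and FakeFeature are hypothetical and only mimic the .data attributes the method touches; the real inputs are MultiModalFeatureSpec objects populated by the multimodal processor.

import torch
from dataclasses import dataclass


# Hypothetical stand-ins: only the attributes used by gather_kwargs are modeled.
@dataclass
class FakeElem:
    data: torch.Tensor


@dataclass
class FakeFeature:
    data: dict[str, FakeElem] | None


def gather_kwargs_sketch(features, keys):
    # Mirrors the logic above: collect `elem.data` per requested key,
    # skipping features that carry no data or that lack the key.
    out: dict[str, list[torch.Tensor]] = {}
    for f in features:
        item = f.data
        if item is not None:
            for k in keys:
                if k in item:
                    out.setdefault(k, []).append(item[k].data)
    return out


features = [
    FakeFeature({"image_grid_thw": FakeElem(torch.tensor([1, 32, 32]))}),
    FakeFeature(None),  # contributes nothing
    FakeFeature({"video_grid_thw": FakeElem(torch.tensor([4, 16, 16]))}),
]
print(gather_kwargs_sketch(features, {"image_grid_thw", "video_grid_thw"}))
# -> {'image_grid_thw': [tensor(...)], 'video_grid_thw': [tensor(...)]}

Each returned value is a list with one tensor per matching feature, which is why the model-side hunks above either convert the entries with .tolist() or combine them with torch.stack.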

View File

@@ -892,38 +892,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.input_batch.num_accepted_tokens_cpu[i] = num_tokens
 
     def _init_mrope_positions(self, req_state: CachedRequestState):
-        image_grid_thw = []
-        video_grid_thw = []
-        second_per_grid_ts = []
-        audio_feature_lengths = []
-        use_audio_in_video = False
-        for mm_feature in req_state.mm_features:
-            mm_item = mm_feature.data
-            if mm_item is None:
-                continue
-            mm_input = mm_item.get_data()
-            if (t := mm_input.get("image_grid_thw")) is not None:
-                image_grid_thw.append(t.tolist())
-            if (t := mm_input.get("video_grid_thw")) is not None:
-                video_grid_thw.append(t.tolist())
-            if (t := mm_input.get("second_per_grid_ts")) is not None:
-                second_per_grid_ts.append(t)
-            if (t := mm_input.get("audio_feature_lengths")) is not None:
-                audio_feature_lengths.append(t)
-            if mm_input.get("use_audio_in_video") is True:
-                use_audio_in_video = True
-
-        assert supports_mrope(self.get_model()), "M-RoPE support is not implemented."
+        model = self.get_model()
+        assert supports_mrope(model), "M-RoPE support is not implemented."
         req_state.mrope_positions, req_state.mrope_position_delta = (
-            self.model.get_mrope_input_positions(
+            model.get_mrope_input_positions(
                 req_state.prompt_token_ids,
-                hf_config=self.model_config.hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
+                req_state.mm_features,
             )
         )