From d0e186c16f0d62af8c128e2dc7c94cde1387ac02 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Tue, 11 Nov 2025 00:30:06 +0800
Subject: [PATCH] [V0 Deprecation] Remove unused `context_len` and `seq_len`
 from M-RoPE (#28395)

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/ernie45_vl.py              | 3 ---
 vllm/model_executor/models/glm4_1v.py                 | 3 ---
 vllm/model_executor/models/glm4v.py                   | 3 ---
 vllm/model_executor/models/interfaces.py              | 4 ----
 vllm/model_executor/models/keye.py                    | 3 ---
 vllm/model_executor/models/keye_vl1_5.py              | 3 ---
 vllm/model_executor/models/paddleocr_vl.py            | 3 ---
 vllm/model_executor/models/qwen2_5_omni_thinker.py    | 3 ---
 vllm/model_executor/models/qwen2_5_vl.py              | 3 ---
 vllm/model_executor/models/qwen2_vl.py                | 3 ---
 vllm/model_executor/models/qwen3_omni_moe_thinker.py  | 2 --
 vllm/model_executor/models/qwen3_vl.py                | 4 +---
 vllm/model_executor/models/transformers/multimodal.py | 4 +---
 13 files changed, 2 insertions(+), 39 deletions(-)

diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 7c1eba103ae7e..f287cff12086b 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1435,8 +1435,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -1569,7 +1567,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
             llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
 
         return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 121e84469c52f..b9cd3545ec453 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -1622,8 +1622,6 @@ class Glm4vForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1754,7 +1752,6 @@ class Glm4vForConditionalGeneration(
             llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
 
         return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 2de1e48109521..ebf6934dddead 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -625,8 +625,6 @@ class GLM4VForCausalLM(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -758,7 +756,6 @@ class GLM4VForCausalLM(
             llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1))
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
-        llm_positions = llm_positions[:, context_len:seq_len]
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
 
         return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index b634c7ec7d67b..d6a8f86d998bb 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -995,8 +995,6 @@ class SupportsMRoPE(Protocol):
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1012,8 +1010,6 @@ class SupportsMRoPE(Protocol):
             image_grid_thw: Image grid dimensions (t, h, w)
             video_grid_thw: Video grid dimensions (t, h, w)
             second_per_grid_ts: Seconds per grid timestep for videos
-            context_len: Context length
-            seq_len: Sequence length
             audio_feature_lengths: Audio feature lengths for multimodal models
             use_audio_in_video: Whether to use audio in video for interleaving
 
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 5f8659a3064eb..42f16ad9f3b3a 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1630,8 +1630,6 @@ class KeyeForConditionalGeneration(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -1759,6 +1757,5 @@ class KeyeForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py
index 13e5b2d5f1575..6f95a59d36d29 100644
--- a/vllm/model_executor/models/keye_vl1_5.py
+++ b/vllm/model_executor/models/keye_vl1_5.py
@@ -600,8 +600,6 @@ class KeyeVL1_5ForConditionalGeneration(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -729,6 +727,5 @@ class KeyeVL1_5ForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 377b41a355782..631475c964c0b 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -1179,8 +1179,6 @@ class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, Support
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
         second_per_grid_ts: list[float],
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1293,7 +1291,6 @@ class PaddleOCRVLForConditionalGeneration(nn.Module, SupportsMultiModal, Support
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
 
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 7e970ebbe2bbc..fac281d2caf49 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -927,8 +927,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1125,7 +1123,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
 
         mrope_position_delta = (
             torch.cat(llm_pos_ids_list, dim=1).max() + 1 - len(src_item)
         )
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index d337f1606943a..48834ba699e4c 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -1118,8 +1118,6 @@ class Qwen2_5_VLForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
         second_per_grid_ts: list[float],
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1232,7 +1230,6 @@ class Qwen2_5_VLForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 9206ac8f9d032..b3999e6c934e3 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1240,8 +1240,6 @@ class Qwen2VLForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -1360,7 +1358,6 @@ class Qwen2VLForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
 
         return llm_positions, mrope_position_delta
 
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index f20e679027214..da489a812f55d 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1417,8 +1417,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 2d8f431bb8fa7..fe0124ef3258b 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -1419,8 +1419,6 @@ class Qwen3VLForConditionalGeneration(
         hf_config: PretrainedConfig,
         image_grid_thw: list[list[int]] | torch.Tensor,
         video_grid_thw: list[list[int]] | torch.Tensor,
-        context_len: int = 0,
-        seq_len: int | None = None,
         second_per_grid_ts: list[float] | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
@@ -1519,7 +1517,7 @@ class Qwen3VLForConditionalGeneration(
 
         llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
-        llm_positions = llm_positions[:, context_len:seq_len]
+
         return llm_positions, mrope_position_delta
 
     def get_language_model(self) -> torch.nn.Module:
diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py
index 10abd86595360..476074542e6ae 100644
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -371,8 +371,6 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
         image_grid_thw: list[list[int]] | torch.Tensor | None,
         video_grid_thw: list[list[int]] | torch.Tensor | None,
         second_per_grid_ts: list[float] | None = None,
-        context_len: int = 0,
-        seq_len: int | None = None,
         audio_feature_lengths: torch.Tensor | None = None,
         use_audio_in_video: bool = False,
     ) -> tuple[torch.Tensor, int]:
@@ -390,7 +388,7 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
             video_grid_thw=video_grid_thw,
         )
 
-        mrope_positions = mrope_positions[:, 0, context_len:seq_len]
+        mrope_positions = mrope_positions[:, 0]
         mrope_position_delta = mrope_position_delta[0].item()
 
         return mrope_positions, mrope_position_delta
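Illustrative sketch (not part of the patch): after this change, `get_mrope_input_positions()` returns M-RoPE positions for the whole token sequence, and the windowing that the removed `llm_positions[:, context_len:seq_len]` slice used to perform would be applied by the caller if it were ever needed. The snippet below only assumes the documented return shape of (3, num_tokens); the tensor values and the `context_len`/`seq_len` bounds are hypothetical stand-ins.

    import torch

    # Stand-in for the (3, num_tokens) positions tensor returned by
    # get_mrope_input_positions(); real values come from the image/video
    # grid layout, not from arange().
    llm_positions = torch.arange(12).view(1, -1).expand(3, -1)

    # Hypothetical window bounds; these are no longer method parameters.
    context_len, seq_len = 4, 10
    windowed = llm_positions[:, context_len:seq_len]  # same semantics as the removed slice
    print(windowed.shape)  # torch.Size([3, 6])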