From b8f603cebe39315bcd7cca1648bdb88226e5caa0 Mon Sep 17 00:00:00 2001
From: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Date: Mon, 6 Oct 2025 19:23:37 +0300
Subject: [PATCH] [Model] EVS support for nano_nemotron_vl (#26269)

Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Signed-off-by: Eugene Khvedchenia
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Eugene Khvedchenia
---
 .../model_executor/models/nano_nemotron_vl.py | 228 ++++++++++++++++--
 vllm/model_executor/models/qwen2_5_vl.py      |   8 +-
 vllm/multimodal/evs.py                        |  21 +-
 3 files changed, 225 insertions(+), 32 deletions(-)

diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 7c64d14ca9d7c..039ffbddf8dba 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -30,6 +30,7 @@ from vllm.model_executor.models.interfaces import (
     IsHybrid,
     MultiModalEmbeddings,
     SupportsMultiModal,
+    SupportsMultiModalPruning,
 )
 from vllm.model_executor.models.internvl import (
     calculate_internvl_targets,
@@ -44,6 +45,10 @@ from vllm.model_executor.models.utils import (
     maybe_prefix,
 )
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.evs import (
+    compute_retained_tokens_count,
+    compute_retention_mask,
+)
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
@@ -62,13 +67,20 @@ from vllm.multimodal.processing import (
     PromptReplacement,
     PromptUpdate,
     PromptUpdateDetails,
+    _seq2tokens,
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.transformers_utils.tokenizer import (
+    AnyTokenizer,
+    cached_tokenizer_from_config,
+    encode_tokens,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
+from .utils import _merge_multimodal_embeddings
+
 # Configure PIL to handle large images without warnings
 # This prevents DecompressionBombWarning for legitimate large images
 Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
@@ -382,6 +394,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
         max_dynamic_patch: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
         video_token: Optional[str] = None,
+        video_pruning_rate: Optional[float] = None,
     ) -> None:
         super().__init__(
             config=config,
@@ -392,6 +405,7 @@
         )
         # add extra video token for video processing
         self.video_token = video_token
+        self.video_pruning_rate = video_pruning_rate
 
     @property
     def supports_video(self) -> bool:
@@ -446,12 +460,38 @@
             ),
         }
 
-        for pixel_values in pixel_values_lst_video:
-            num_patches = pixel_values.shape[0]
+        image_size: int = self.config.force_image_size
+        patch_size: int = self.config.patch_size
+        downsample_ratio = self.config.downsample_ratio
+        tokens_in_single_frame = int(
+            (image_size * image_size // patch_size**2) * (downsample_ratio**2)
+        )
+
+        for pixel_values in pixel_values_lst_video:
+            num_frames = pixel_values.shape[0]
+
+            if (
+                self.video_pruning_rate is not None
+                and self.video_pruning_rate > 0.0
+            ):
+                # Start of EVS-specific code
+                num_tokens = compute_retained_tokens_count(
+                    tokens_per_frame=tokens_in_single_frame,
+                    num_frames=num_frames,
+                    q=self.video_pruning_rate,
+                )
+
+                # We only need placeholders here that won't actually be
+                # replaced; the point is that the total token count is
+                # correct, so assign all tokens to the first frame.
+                tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
+
+                # End of EVS-specific code
+            else:
+                tokens_per_frame = [tokens_in_single_frame] * num_frames
+
+            video_repl = self.get_video_repl(tokens_per_frame, self.video_token)
 
-            video_repl = self.get_video_repl(
-                self.num_image_token, num_patches, self.video_token
-            )
             text = [t.replace("
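
A quick sketch of the placeholder accounting in the hunk above, for readers
following along. With an illustrative config of force_image_size=512,
patch_size=16 and downsample_ratio=0.5 (assumed values, not necessarily the
model's defaults), a single frame costs
int((512 * 512 // 16**2) * 0.5**2) = 256 tokens. The snippet below mirrors
the processor logic; retained_tokens_count is a hypothetical stand-in for
vllm.multimodal.evs.compute_retained_tokens_count, assuming it keeps roughly
a (1 - q) fraction of the total video tokens (the real rounding rules live
in vllm/multimodal/evs.py):

import math

def retained_tokens_count(tokens_per_frame: int, num_frames: int, q: float) -> int:
    # Hypothetical stand-in: keep a (1 - q) fraction of all video tokens,
    # but never less than one full frame's worth.
    total = tokens_per_frame * num_frames
    return max(tokens_per_frame, math.ceil(total * (1.0 - q)))

# Mirrors the processor logic in the hunk above.
tokens_in_single_frame = int((512 * 512 // 16**2) * 0.5**2)  # 256
num_frames = 8
video_pruning_rate = 0.75  # q: prune 75% of video tokens

if video_pruning_rate > 0.0:
    num_tokens = retained_tokens_count(
        tokens_in_single_frame, num_frames, video_pruning_rate
    )
    # Placeholders only: the total must match, so assign everything
    # to the first frame.
    tokens_per_frame = [num_tokens] + [0] * (num_frames - 1)
else:
    tokens_per_frame = [tokens_in_single_frame] * num_frames

print(tokens_per_frame)       # [512, 0, 0, 0, 0, 0, 0, 0]
print(sum(tokens_per_frame))  # 512 of the unpruned 2048 tokens

Assigning the whole retained budget to frame 0 works because these counts
only size the prompt placeholders; the actual token selection happens later
via the retention mask, not here.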