diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 7c64d14ca9d7c..039ffbddf8dba 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -30,6 +30,7 @@ from vllm.model_executor.models.interfaces import ( IsHybrid, MultiModalEmbeddings, SupportsMultiModal, + SupportsMultiModalPruning, ) from vllm.model_executor.models.internvl import ( calculate_internvl_targets, @@ -44,6 +45,10 @@ from vllm.model_executor.models.utils import ( maybe_prefix, ) from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.evs import ( + compute_retained_tokens_count, + compute_retention_mask, +) from vllm.multimodal.inputs import ( MultiModalDataDict, MultiModalFieldConfig, @@ -62,13 +67,20 @@ from vllm.multimodal.processing import ( PromptReplacement, PromptUpdate, PromptUpdateDetails, + _seq2tokens, ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.radio import RadioConfig -from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer import ( + AnyTokenizer, + cached_tokenizer_from_config, + encode_tokens, +) from vllm.utils.tensor_schema import TensorSchema, TensorShape +from .utils import _merge_multimodal_embeddings + # Configure PIL to handle large images without warnings # This prevents DecompressionBombWarning for legitimate large images Image.MAX_IMAGE_PIXELS = None # Disable the limit entirely @@ -382,6 +394,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): max_dynamic_patch: Optional[int] = None, dynamic_image_size: Optional[bool] = None, video_token: Optional[str] = None, + video_pruning_rate: Optional[float] = None, ) -> None: super().__init__( config=config, @@ -392,6 +405,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): ) # add extra video token for video processing self.video_token = video_token + self.video_pruning_rate = video_pruning_rate @property def supports_video(self) -> bool: @@ -446,12 +460,38 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor): ), } - for pixel_values in pixel_values_lst_video: - num_patches = pixel_values.shape[0] + image_size: int = self.config.force_image_size + patch_size: int = self.config.patch_size + downsample_ratio = self.config.downsample_ratio + tokens_in_single_frame = int( + (image_size * image_size // patch_size**2) * (downsample_ratio**2) + ) + + for pixel_values in pixel_values_lst_video: + num_frames = pixel_values.shape[0] + + if ( + self.video_pruning_rate is not None + and self.video_pruning_rate > 0.0 + ): + # Start of EVS-specific code + num_tokens = compute_retained_tokens_count( + tokens_per_frame=tokens_in_single_frame, + num_frames=num_frames, + q=self.video_pruning_rate, + ) + + # Here we just need placeholders that won't actually be replaced - + # we just need to make sure the total number of tokens is correct + # assign all tokens to the first frame + tokens_per_frame = [num_tokens] + [0] * (num_frames - 1) + + # End of EVS-specific code + else: + tokens_per_frame = [tokens_in_single_frame] * num_frames + + video_repl = self.get_video_repl(tokens_per_frame, self.video_token) - video_repl = self.get_video_repl( - self.num_image_token, num_patches, self.video_token - ) text = [t.replace("