From c9f3d502c927d94cadc580529d60c72a496a2448 Mon Sep 17 00:00:00 2001 From: oscardev256 <42308241+oscardev256@users.noreply.github.com> Date: Mon, 15 Dec 2025 20:25:03 -0500 Subject: [PATCH] Remove TensorStream logic no longer needed after updating get_mrope_input_positions. Signed-off-by: oscardev256 <42308241+oscardev256@users.noreply.github.com> --- vllm/model_executor/models/isaac.py | 282 ++-------------------------- 1 file changed, 13 insertions(+), 269 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 28fb68d62052a..17e498c3cc8dd 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -2,11 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations -import itertools import math from collections.abc import Iterable, Iterator, Mapping, Sequence -from dataclasses import dataclass -from enum import Enum from typing import Any import numpy as np @@ -75,272 +72,6 @@ from vllm.sequence import IntermediateTensors from vllm.tokenizers import get_tokenizer from vllm.tokenizers.hf import get_cached_tokenizer -# ===== TensorStream Compatibility Layer for Isaac MRoPE ===== -# Minimal implementation of TensorStream classes needed for Isaac's 3D positional -# encoding - - -class ModalityType(Enum): - """ - Base class for modality-type enumerations. - Each derived class (VisionType, TextType) holds - an integer value that identifies a specific modality. - - Example usage: - If you have an object `my_event` of class `Event`, - you might write: - if my_event.type == VisionType.image: - # process an image frame - - The methods below implement ordering and hashing - based on the integer `.value` of each enum member. - """ - - @property - def modality(self): - return self.__class__ - - def __lt__(self, other): - if isinstance(other, ModalityType): - return self.value < other.value - raise NotImplementedError() - - def __eq__(self, other): - if isinstance(other, ModalityType): - return self.value == other.value - raise NotImplementedError() - - def __hash__(self): - return hash(self.value) - - -# NOTE: modality types need to be unique -class VisionType(ModalityType): - """ - Enum for vision modalities such as key video frames. - Typically used in video processing or image sequences. - - Members: - image: A single image frame. - """ - - image = 0 - - -class TextType(ModalityType): - """ - Enum for text tokens and padding. - - Members: - text: Actual textual tokens. - padding: Padding tokens used in sequence batching. - """ - - text = 1 - padding = 2 - - -@dataclass -class Event: - """Represents a single modality event with spatial/temporal dimensions.""" - - """ - Represents a single data occurrence (with a specific type, time interval, and - data payload). - - Attributes: - data (Any): The actual data payload (e.g. a torch.Tensor, a string, - etc.). - type (ModalityType): The modality type of the data (e.g., - VisionType.image). - time (Tuple[float, float]): (start_time, end_time) indicating when this - Event occurs. - role (Optional[str]): The role associated with this event (e.g., "user", - "agent", "system"). If None, the event is always included in loss - calculation. - - Example usage: - evt = Event(data=torch.zeros((1, 224, 224, 3)), # e.g. a single image frame - type=VisionType.image, - time=(0.0, 0.04), - role="user") - """ - # Descriptors - modality_type: ModalityType - - # Structure - dims_virtual: list[int] | None = ( - None # virtual/processed dimensions (e.g., pixel-shuffled) - ) - dims_real: list[int] | None = None # real/actual tensor dimensions - idx_range: tuple[int, int] | None = None - - def dims(self, virtual: bool = True) -> list[int] | None: - """ - Get the dimensions of this event. - - Args: - virtual: If True (default), return virtual/processed dimensions - (e.g., pixel-shuffled). If False, return real/actual tensor - dimensions. - - Returns: - Dimensions list or None if not measured. - """ - if virtual: - return self.dims_virtual - else: - return self.dims_real - - def num_tokens(self, partial=True, virtual=True) -> int: - if not virtual: - assert partial is False and isinstance(self.data, torch.Tensor) - return math.prod(self.dims(virtual=False)) - return ( - self.idx_range[1] - self.idx_range[0] if partial else math.prod(self.dims()) - ) - - -@dataclass -class Stream: - """ - Represents an ordered sequence of Event objects, each with - a specific ModalityType and a time range. - - Attributes: - events (List[Event]): The list of Event objects in the stream. - priority (List[ModalityType]): A list of modality types that define - how we might want to reorder or prioritize events if scheduling is needed. - - Example usage: - # Create two events of different types - evt1 = Event(torch.zeros((1, 224, 224, 3)), VisionType.image, (0.0, 0.04)) - evt2 = Event(torch.randint(0, 1000, (16, 1)), TextType.text, (0.0, 0.32)) - - # Make a stream with a given priority - s = Stream(events=[evt1, evt2], - priority=[VisionType.image, TextType.text]) - - print(s) - """ - - events: list[Event] - - def __len__(self): - """Returns the number of Event objects in this Stream.""" - return len(self.events) - - def __getitem__(self, key: int) -> Stream | Event: - return self.events[key] - - def __iter__(self): - """ - Yields each Event in the Stream, enabling iteration like: - for event in my_stream: - ... - """ - yield from self.events - - -# TODO: implement all types of cool indexing which can happen since TensorStream -# assumes Event.data = Tensor -@dataclass -class TensorStream: - streams: list[Stream] - _device: torch.device | None = None - - @property - def device(self): - return self._device - - @property - def shape(self): - seq_lens = [sum([ev.num_tokens() for ev in stream]) for stream in self.streams] - assert all([sl == seq_lens[0] for sl in seq_lens]), ( - f"each stream must have same token count to have a shape: {seq_lens}" - ) - return (len(seq_lens), seq_lens[0]) - - -def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Tensor: - """ - Create a (batch, T, n_pos_dims) position tensor in one sweep. - The first dim is the running “time” index, the rest are spatial (or 1-fillers). - - Args: - ts : TensorStream - n_pos_dims : total coordinate dimensions (default 3) - - Returns: - torch.LongTensor - shape (batch_size, seq_len, n_pos_dims) - """ - - # Manually iterate through streams and events like map_compact does, - # but maintain cumulative time offset for each stream - all_coords = [] - for stream in ts.streams: # one Stream == one batch sample - cumulative_offset = 0 # running time index for this stream - - for event in stream: - # --- build coordinate grid for THIS event using itertools - # (no tensor ops) --- - dims = (event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or [])) - - # Create ranges for each dimension (similar to old _finalize implementation) - first_dim = list(range(cumulative_offset, cumulative_offset + dims[0])) - cumulative_offset += dims[0] # advance time for the next event - - if event.modality_type != VisionType.image: - full_coords = [(t, t, t) for t in first_dim] - else: - other_dims = [range(d) for d in dims[1:]] - full_coords = list(itertools.product(first_dim, *other_dims)) - - # Slice if the event is partial - s, e = event.idx_range - coords = full_coords[s:e] - - # Extend the flattened coordinate list - all_coords.extend(coords) - - # Convert to tensor and reshape to (B, T, n_pos_dims) - B, T = ts.shape - return torch.tensor(all_coords, dtype=torch.long, device=ts.device).reshape( - B, T, n_pos_dims - ) - - -def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int: - tokenizer_name = model_config.tokenizer or model_config.model - tokenizer = get_cached_tokenizer( - get_tokenizer( - tokenizer_name, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.tokenizer_revision or model_config.revision, - ) - ) - return tokenizer.encode(vision_token, add_special_tokens=False)[0] - - -def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor: - """Create boolean mask for specific modality type in the tensor stream.""" - B, T = ts.shape - mask = torch.zeros((B, T), dtype=torch.bool, device=ts.device) - - for batch_idx, stream in enumerate(ts.streams): - seq_idx = 0 - for event in stream: - if event.modality_type == modality_type: - start, end = event.idx_range - mask[batch_idx, seq_idx : seq_idx + (end - start)] = True - seq_idx += event.idx_range[1] - event.idx_range[0] - - return mask - - -# ===== End TensorStream Compatibility Layer ===== - class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): """Vision configuration for Isaac with Pixel Shuffle support. @@ -738,6 +469,19 @@ _MEAN_TENSOR = torch.tensor(VISION_MEAN, dtype=torch.float32).view(1, 1, 1, -1) _STD_TENSOR = torch.tensor(VISION_STD, dtype=torch.float32).view(1, 1, 1, -1) +def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int: + tokenizer_name = model_config.tokenizer or model_config.model + tokenizer = get_cached_tokenizer( + get_tokenizer( + tokenizer_name, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.tokenizer_revision or model_config.revision, + ) + ) + return tokenizer.encode(vision_token, add_special_tokens=False)[0] + + def prepare_image_tensor( image: torch.Tensor, scale: float = VISION_SCALE,