Remove TensorStream logic no longer needed after updating get_mrope_input_positions.

Signed-off-by: oscardev256 <42308241+oscardev256@users.noreply.github.com>
oscardev256 2025-12-15 20:25:03 -05:00 committed by Yang Liu
parent 748508ae4f
commit c9f3d502c9


@@ -2,11 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import itertools
import math
from collections.abc import Iterable, Iterator, Mapping, Sequence
from dataclasses import dataclass
from enum import Enum
from typing import Any
import numpy as np
@@ -75,272 +72,6 @@ from vllm.sequence import IntermediateTensors
from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.hf import get_cached_tokenizer
# ===== TensorStream Compatibility Layer for Isaac MRoPE =====
# Minimal implementation of TensorStream classes needed for Isaac's 3D positional
# encoding
class ModalityType(Enum):
    """
    Base class for modality-type enumerations.

    Each derived class (VisionType, TextType) holds
    an integer value that identifies a specific modality.

    Example usage:
        If you have an object `my_event` of class `Event`,
        you might write:

            if my_event.modality_type == VisionType.image:
                # process an image frame

    The methods below implement ordering and hashing
    based on the integer `.value` of each enum member.
    """

    @property
    def modality(self):
        return self.__class__

    def __lt__(self, other):
        if isinstance(other, ModalityType):
            return self.value < other.value
        raise NotImplementedError()

    def __eq__(self, other):
        if isinstance(other, ModalityType):
            return self.value == other.value
        raise NotImplementedError()

    def __hash__(self):
        return hash(self.value)

# NOTE: modality types need to be unique
class VisionType(ModalityType):
    """
    Enum for vision modalities such as key video frames.
    Typically used in video processing or image sequences.

    Members:
        image: A single image frame.
    """

    image = 0


class TextType(ModalityType):
    """
    Enum for text tokens and padding.

    Members:
        text: Actual textual tokens.
        padding: Padding tokens used in sequence batching.
    """

    text = 1
    padding = 2

@dataclass
class Event:
    """
    Represents a single modality event with spatial/temporal dimensions.

    Attributes:
        modality_type (ModalityType): The modality of this event (e.g.,
            VisionType.image or TextType.text).
        dims_virtual (list[int] | None): Virtual/processed dimensions
            (e.g., after pixel shuffle), with the leading dimension treated
            as time.
        dims_real (list[int] | None): Real/actual tensor dimensions.
        idx_range (tuple[int, int] | None): (start, end) token indices this
            event covers within its stream; a partial event covers only a
            slice of its full token grid.

    Example usage:
        evt = Event(modality_type=VisionType.image,
                    dims_virtual=[1, 16, 16],
                    idx_range=(0, 256))
    """

    # Descriptors
    modality_type: ModalityType
    # Structure
    dims_virtual: list[int] | None = (
        None  # virtual/processed dimensions (e.g., pixel-shuffled)
    )
    dims_real: list[int] | None = None  # real/actual tensor dimensions
    idx_range: tuple[int, int] | None = None

    def dims(self, virtual: bool = True) -> list[int] | None:
        """
        Get the dimensions of this event.

        Args:
            virtual: If True (default), return virtual/processed dimensions
                (e.g., pixel-shuffled). If False, return real/actual tensor
                dimensions.

        Returns:
            Dimensions list or None if not measured.
        """
        if virtual:
            return self.dims_virtual
        else:
            return self.dims_real

    def num_tokens(self, partial=True, virtual=True) -> int:
        if not virtual:
            # NOTE: the non-virtual path expects a tensor `data` payload,
            # which this minimal Event does not define.
            assert partial is False and isinstance(self.data, torch.Tensor)
            return math.prod(self.dims(virtual=False))
        return (
            self.idx_range[1] - self.idx_range[0] if partial else math.prod(self.dims())
        )
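

# Illustrative sketch (values assumed, not taken from the model code): how
# dims() and num_tokens() relate for a partial event covering the first half
# of a 1x16x16 token grid.
#     evt = Event(VisionType.image, dims_virtual=[1, 16, 16], idx_range=(0, 128))
#     evt.dims()                     # [1, 16, 16]
#     evt.num_tokens()               # 128, the idx_range span (partial=True)
#     evt.num_tokens(partial=False)  # 256, the full grid (prod of dims_virtual)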
@dataclass
class Stream:
    """
    Represents an ordered sequence of Event objects for a single sample.

    Attributes:
        events (list[Event]): The list of Event objects in the stream.

    Example usage:
        # Create two events of different modalities
        evt1 = Event(VisionType.image, dims_virtual=[1, 16, 16], idx_range=(0, 256))
        evt2 = Event(TextType.text, dims_virtual=[16], idx_range=(0, 16))
        # Make a stream holding both events
        s = Stream(events=[evt1, evt2])
        print(s)
    """

    events: list[Event]

    def __len__(self):
        """Returns the number of Event objects in this Stream."""
        return len(self.events)

    def __getitem__(self, key: int) -> Stream | Event:
        return self.events[key]

    def __iter__(self):
        """
        Yields each Event in the Stream, enabling iteration like:

            for event in my_stream:
                ...
        """
        yield from self.events

    # TODO: implement all types of cool indexing which can happen since TensorStream
    # assumes Event.data = Tensor

@dataclass
class TensorStream:
    streams: list[Stream]
    _device: torch.device | None = None

    @property
    def device(self):
        return self._device

    @property
    def shape(self):
        seq_lens = [sum([ev.num_tokens() for ev in stream]) for stream in self.streams]
        assert all([sl == seq_lens[0] for sl in seq_lens]), (
            f"each stream must have same token count to have a shape: {seq_lens}"
        )
        return (len(seq_lens), seq_lens[0])

def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Tensor:
    """
    Create a (batch, T, n_pos_dims) position tensor in one sweep.
    The first dim is the running time index, the rest are spatial (or 1-fillers).

    Args:
        ts : TensorStream
        n_pos_dims : total coordinate dimensions (default 3)

    Returns:
        torch.LongTensor - shape (batch_size, seq_len, n_pos_dims)
    """
    # Manually iterate through streams and events like map_compact does,
    # but maintain cumulative time offset for each stream
    all_coords = []
    for stream in ts.streams:  # one Stream == one batch sample
        cumulative_offset = 0  # running time index for this stream
        for event in stream:
            # --- build coordinate grid for THIS event using itertools
            # (no tensor ops) ---
            dims = (event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or []))
            # Create ranges for each dimension (similar to old _finalize implementation)
            first_dim = list(range(cumulative_offset, cumulative_offset + dims[0]))
            cumulative_offset += dims[0]  # advance time for the next event

            if event.modality_type != VisionType.image:
                full_coords = [(t, t, t) for t in first_dim]
            else:
                other_dims = [range(d) for d in dims[1:]]
                full_coords = list(itertools.product(first_dim, *other_dims))

            # Slice if the event is partial
            s, e = event.idx_range
            coords = full_coords[s:e]
            # Extend the flattened coordinate list
            all_coords.extend(coords)

    # Convert to tensor and reshape to (B, T, n_pos_dims)
    B, T = ts.shape
    return torch.tensor(all_coords, dtype=torch.long, device=ts.device).reshape(
        B, T, n_pos_dims
    )
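

# Illustrative sketch (shapes assumed): one stream holding a 4-token text prefix
# followed by a 1x2x2 image grid, run through the helper above.
#     text = Event(TextType.text, dims_virtual=[4], idx_range=(0, 4))
#     image = Event(VisionType.image, dims_virtual=[1, 2, 2], idx_range=(0, 4))
#     ts = TensorStream(streams=[Stream(events=[text, image])])
#     pos = compute_mrope_pos_tensor(ts)  # shape (1, 8, 3)
#     # text tokens get (t, t, t):  (0,0,0) (1,1,1) (2,2,2) (3,3,3)
#     # image tokens get (t, h, w): (4,0,0) (4,0,1) (4,1,0) (4,1,1)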
def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int:
    tokenizer_name = model_config.tokenizer or model_config.model
    tokenizer = get_cached_tokenizer(
        get_tokenizer(
            tokenizer_name,
            tokenizer_mode=model_config.tokenizer_mode,
            trust_remote_code=model_config.trust_remote_code,
            revision=model_config.tokenizer_revision or model_config.revision,
        )
    )
    return tokenizer.encode(vision_token, add_special_tokens=False)[0]

def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor:
    """Create boolean mask for specific modality type in the tensor stream."""
    B, T = ts.shape
    mask = torch.zeros((B, T), dtype=torch.bool, device=ts.device)
    for batch_idx, stream in enumerate(ts.streams):
        seq_idx = 0
        for event in stream:
            if event.modality_type == modality_type:
                start, end = event.idx_range
                mask[batch_idx, seq_idx : seq_idx + (end - start)] = True
            seq_idx += event.idx_range[1] - event.idx_range[0]
    return mask
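

# Illustrative sketch: selecting the image positions of the stream built in the
# sketch above compute_mrope_pos_tensor.
#     mask = modality_mask(ts, VisionType.image)
#     # tensor([[False, False, False, False, True, True, True, True]])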
# ===== End TensorStream Compatibility Layer =====
class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig):
"""Vision configuration for Isaac with Pixel Shuffle support.
@@ -738,6 +469,19 @@ _MEAN_TENSOR = torch.tensor(VISION_MEAN, dtype=torch.float32).view(1, 1, 1, -1)
_STD_TENSOR = torch.tensor(VISION_STD, dtype=torch.float32).view(1, 1, 1, -1)
def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int:
    tokenizer_name = model_config.tokenizer or model_config.model
    tokenizer = get_cached_tokenizer(
        get_tokenizer(
            tokenizer_name,
            tokenizer_mode=model_config.tokenizer_mode,
            trust_remote_code=model_config.trust_remote_code,
            revision=model_config.tokenizer_revision or model_config.revision,
        )
    )
    return tokenizer.encode(vision_token, add_special_tokens=False)[0]
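

# Illustrative sketch (token string assumed): resolving the id of the placeholder
# token used for vision content, given a vLLM ModelConfig instance.
#     vision_token_id = _resolve_vision_token_id(model_config, "<image>")
#     # "<image>" is a hypothetical placeholder; Isaac's actual vision token may differ.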
def prepare_image_tensor(
    image: torch.Tensor,
    scale: float = VISION_SCALE,