From f053784631acd622978b96434746d3d70a9fe166 Mon Sep 17 00:00:00 2001 From: oscardev256 <42308241+oscardev256@users.noreply.github.com> Date: Sat, 8 Nov 2025 23:11:33 -0500 Subject: [PATCH 01/18] Create isaac.py Signed-off-by: oscardev256 <42308241+oscardev256@users.noreply.github.com> --- vllm/model_executor/models/isaac.py | 1490 +++++++++++++++++++++++++++ 1 file changed, 1490 insertions(+) create mode 100644 vllm/model_executor/models/isaac.py diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py new file mode 100644 index 0000000000000..4f29d1cff347a --- /dev/null +++ b/vllm/model_executor/models/isaac.py @@ -0,0 +1,1490 @@ +from __future__ import annotations + +from collections.abc import Mapping, Sequence, Iterable +from typing import Any, Optional, Union +from typing_extensions import TypedDict, Unpack + +import itertools +from enum import Enum +from dataclasses import dataclass + +import math +import numpy as np +import PIL.Image +import torch +import torch.nn as nn +import torch.nn.functional as F + +from transformers import PretrainedConfig, Qwen3Config +from transformers.image_processing_utils import BatchFeature +from transformers.tokenization_utils import TensorType +from transformers.models.siglip2.modeling_siglip2 import ( + Siglip2MLP, +) +from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.model_executor.models.interfaces import SupportsMultiModal +from vllm.model_executor.models.utils import ( + WeightsMapper, + AutoWeightsLoader, + _merge_multimodal_embeddings, +) +from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal.processing import ( + BaseMultiModalProcessor, + BaseProcessingInfo, + PromptReplacement, +) +from vllm.multimodal.parse import MultiModalDataItems, ImageSize +from vllm.multimodal.profiling import 
@dataclass
class Event:
    """Describe one modality segment of a stream: its type, sizes, and token span.

    Attributes:
        modality_type: Modality of the segment (e.g. ``VisionType.image`` or
            ``TextType.text``).
        dims_virtual: Virtual/processed dimensions (e.g. pixel-shuffled), or
            ``None`` if not measured.
        dims_real: Real/actual tensor dimensions, or ``None`` if not measured.
        idx_range: ``(start, end)`` offsets of the portion of the event that is
            present; ``end - start`` is the partial token count.

    Example usage:
        evt = Event(modality_type=VisionType.image,
                    dims_virtual=[1, 8, 8],
                    dims_real=[1, 16, 16],
                    idx_range=(0, 64))
    """

    # Descriptors
    modality_type: ModalityType

    # Structure
    dims_virtual: list[int] | None = None  # virtual/processed dimensions (e.g., pixel-shuffled)
    dims_real: list[int] | None = None  # real/actual tensor dimensions
    idx_range: tuple[int, int] | None = None

    def dims(self, virtual: bool = True) -> list[int] | None:
        """Return the dimensions of this event.

        Args:
            virtual: If True (default), return the virtual/processed dimensions.
                If False, return the real/actual tensor dimensions.

        Returns:
            The requested dimensions list, or None if not measured.
        """
        return self.dims_virtual if virtual else self.dims_real

    def num_tokens(self, partial=True, virtual=True) -> int:
        """Return the number of tokens this event contributes.

        Args:
            partial: If True, count only the tokens inside ``idx_range``;
                otherwise count the full product of the dimensions.
            virtual: If True, measure with the virtual dimensions; if False,
                measure with the real dimensions (only valid with
                ``partial=False``).

        Raises:
            ValueError: If ``virtual`` is False while ``partial`` is True.
        """
        if not virtual:
            # The previous implementation also inspected `self.data`, an
            # attribute this dataclass never defines, so this branch could
            # not succeed. Only the `partial` precondition is meaningful.
            if partial:
                raise ValueError("num_tokens(virtual=False) requires partial=False")
            return math.prod(self.dims(virtual=False))
        if partial:
            return self.idx_range[1] - self.idx_range[0]
        return math.prod(self.dims())
# TODO: implement all types of cool indexing which can happen since TensorStream assumes Event.data = Tensor
@dataclass
class TensorStream:
    """A batch of streams plus the device on which derived tensors are created.

    Attributes:
        streams: One ``Stream`` per batch sample.
        _device: Device used by helpers that materialize tensors from this
            stream; may be None.
    """

    streams: list[Stream]
    _device: torch.device | None = None

    @property
    def device(self):
        """Return the device recorded for this tensor stream (may be None)."""
        return self._device

    @property
    def shape(self):
        """Return ``(num_streams, tokens_per_stream)``.

        The token count of a stream is the sum of ``num_tokens()`` over its
        events. With no streams the shape is ``(0, 0)``.

        Raises:
            AssertionError: If the streams do not all hold the same number of
                tokens.
        """
        seq_lens = [sum(ev.num_tokens() for ev in stream) for stream in self.streams]
        if not seq_lens:
            # Nothing to compare against; avoid indexing an empty list.
            return (0, 0)
        if any(sl != seq_lens[0] for sl in seq_lens):
            # Raised explicitly so the check is not stripped under `python -O`.
            raise AssertionError(f"each stream must have same token count to have a shape: {seq_lens}")
        return (len(seq_lens), seq_lens[0])
def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor:
    """Return a ``(B, T)`` boolean mask that is True where tokens belong to *modality_type*.

    Events are laid out back to back along the sequence axis of each stream;
    every event occupies ``idx_range[1] - idx_range[0]`` positions.
    """
    n_streams, n_tokens = ts.shape
    selected = torch.zeros((n_streams, n_tokens), dtype=torch.bool, device=ts.device)

    for row, stream in enumerate(ts.streams):
        cursor = 0  # write position inside this stream
        for event in stream:
            bounds = event.idx_range
            if event.modality_type == modality_type:
                lo, hi = bounds
                selected[row, cursor:cursor + (hi - lo)] = True
            cursor += bounds[1] - bounds[0]

    return selected
def create_cumulative_seq_lengths(seq_sizes: torch.Tensor, device: torch.device) -> tuple[torch.Tensor, int]:
    """Build the cumulative-length offsets used by variable-length attention.

    Args:
        seq_sizes: 1-D tensor with the length of each packed sequence.
        device: Device on which the offsets tensor is allocated.

    Returns:
        ``(cu_seqlens, max_seqlen)`` where ``cu_seqlens`` is an int32 tensor of
        ``len(seq_sizes) + 1`` entries starting at 0, and ``max_seqlen`` is the
        longest sequence length (0 when ``seq_sizes`` is empty).
    """
    count = len(seq_sizes)
    offsets = torch.zeros(count + 1, dtype=torch.int32, device=device)
    offsets[1:] = torch.cumsum(seq_sizes, dim=0)
    if count == 0:
        return offsets, 0
    return offsets, int(seq_sizes.max().item())
class Siglip2VariableLengthAttention(nn.Module):
    """Multi-head self-attention over one packed, variable-length sequence.

    The attention itself is delegated to ``torch.ops.aten._flash_attention_forward``,
    with sequence boundaries described by cumulative sequence lengths.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        # Projection layers are created in the same order as before so that
        # seeded parameter initialisation is unchanged.
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(self, hidden_states, cu_seqlens=None, max_seqlen=None):
        """Attend within each packed sub-sequence.

        Args:
            hidden_states: Tensor of shape ``(1, seq_len, embed_dim)``.
            cu_seqlens: Cumulative sequence lengths, passed for both queries and keys.
            max_seqlen: Maximum sub-sequence length, passed for both queries and keys.

        Returns:
            ``(attn_output, None)`` with ``attn_output`` of shape
            ``(1, seq_len, embed_dim)``.

        Raises:
            ValueError: If the leading (batch) dimension is not 1.
        """
        batch_size, seq_len, _ = hidden_states.size()
        if batch_size != 1:
            raise ValueError("Variable-length attention expects batch_size=1 for packed sequences")

        # Drop the pseudo batch dimension: (seq_len, embed_dim)
        packed = hidden_states.squeeze(0)
        input_dtype = packed.dtype

        # Project, then split channels into heads: (seq_len, n_heads, head_dim)
        per_head = (-1, self.num_heads, self.embed_dim // self.num_heads)
        queries = self.q_proj(packed).view(*per_head)
        keys = self.k_proj(packed).view(*per_head)
        values = self.v_proj(packed).view(*per_head)

        # Non-causal variable-length attention; dropout only while training.
        drop_p = self.dropout if self.training else 0.0
        context, _, _, _, _ = torch.ops.aten._flash_attention_forward(
            query=queries,
            key=keys,
            value=values,
            cum_seq_q=cu_seqlens,
            cum_seq_k=cu_seqlens,
            max_q=max_seqlen,
            max_k=max_seqlen,
            dropout_p=drop_p,
            is_causal=False,
            return_debug_mask=False,
            scale=self.scale,
            window_size_left=-1,
            window_size_right=-1,
            alibi_slopes=None,
        )

        # Merge heads back: (seq_len, embed_dim)
        context = context.reshape(seq_len, self.embed_dim)

        # Restore the caller's dtype if the kernel changed it
        if context.dtype != input_dtype:
            context = context.to(input_dtype)

        # Output projection, then re-add the batch dimension: (1, seq_len, embed_dim)
        projected = self.out_proj(context).unsqueeze(0)
        return projected, None
hidden_states = residual + hidden_states + + return (hidden_states,) + + +class IsaacEncoder(nn.Module): + """Encoder using Isaac encoder layers with variable-length attention support.""" + + def __init__(self, config: PixelShuffleSiglip2VisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([IsaacSiglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + inputs_embeds, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: int | None = None, + output_hidden_states: bool = False, + ): + all_hidden_states = () if output_hidden_states else None + + hidden_states = inputs_embeds + + for encoder_layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = encoder_layer( + hidden_states, + cu_seqlens, + max_seqlen, + ) + + hidden_states = layer_outputs[0] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + return hidden_states, all_hidden_states, None + + +def create_pixel_shuffle_index_map( + seq_sizes: torch.Tensor, + token_grids: torch.Tensor, + scale_factor: int = 1, + device: torch.device | None = None, +) -> torch.Tensor: + """ + Build a gather-index map that tells us, for every *output* token after + pixel-shuffle, which `scale_factor**2` *input* tokens are being merged. + + Args + ---- + seq_sizes : (num_images,) - #patches in each image (row-major order) + token_grids : (num_images,2) - (height, width) for every image + scale_factor : spatial down-scale factor (≥2) + device : (optional) overrides `seq_sizes.device` + + Returns + ------- + gather_idx : (new_total_seq_len, scale_factor**2) int64 tensor. + gather_idx[i, j] is the *flat* index into the *original* + packed sequence for the j-th sub-patch that forms the + i-th output token. 
+ """ + if device is None: + device = seq_sizes.device + + r = int(scale_factor) + if r < 2: + raise ValueError("`scale_factor` must be ≥ 2") + + # Safety: all spatial dims must be divisible by r + # Cannot run under torch compile fullgraph mode hence + if not torch.compiler.is_compiling(): + if not ((token_grids[:, 0] % r == 0).all() and (token_grids[:, 1] % r == 0).all()): + raise AssertionError( + f"Every (H,W) in `token_grids` must be divisible by scale_factor={r}, got {token_grids.tolist()}" + ) + + gather_chunks: list[torch.Tensor] = [] + tok_offset = 0 + + for seq_len, (h, w) in zip(seq_sizes.tolist(), token_grids.tolist(), strict=False): + # Build the (H, W) grid of flat indices for this image + grid = torch.arange(seq_len, device=device, dtype=torch.int64) + tok_offset + grid = grid.view(h, w) # (H, W) + + # -------- identical ordering to your fixed-res routine -------- + # Step 1: split width into blocks of r + grid = grid.view(h, w // r, r) # (H, W/r, r) + # Step 2: now split height into blocks of r + grid = grid.view(h // r, r, w // r, r) # (H/r, r, W/r, r) + # Step 3: final permutation to (H/r, W/r, r, r) + grid = grid.permute(0, 2, 1, 3).contiguous() # (H/r, W/r, r, r) + # Step 4: each (r, r) block forms one output token + gather_chunks.append(grid.reshape(-1, r * r)) # (H*W / r², r²) + + tok_offset += seq_len + + # Concatenate over all images in the packed batch + gather_idx = torch.cat(gather_chunks, dim=0) # (Σ_i HᵢWᵢ/r², r²) + return gather_idx + + +def pixel_shuffle_varlen( + x: torch.Tensor, + token_grids: torch.Tensor, + scale_factor: int = 1, +) -> torch.Tensor: + r"""Apply pixel shuffle to a packed vision sequence without unpacking per image. + + Args: + x (`torch.Tensor`): + Concatenated vision embeddings. Accepts `(seq_len, hidden_size)` or `(1, seq_len, hidden_size)` shapes + produced by stacking image patches. 
class Siglip2SequenceVisionTransformer(nn.Module):
    """Vision tower that embeds, encodes, normalizes and optionally pixel-shuffles packed patches."""

    def __init__(self, config: PixelShuffleSiglip2VisionConfig):
        super().__init__()
        self.config = config
        self.embeddings = Siglip2VariableSequenceEmbeddings(config)
        self.encoder = IsaacEncoder(config)
        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pixel_shuffle_scale_factor = config.pixel_shuffle_scale_factor

    def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor]):
        """Encode a packed patch sequence.

        Args:
            packed_seq_patches: ``(seq_patches, token_grids)``; each row of
                ``token_grids`` is multiplied out to give the per-image
                sequence length.

        Returns:
            The encoded sequence with the pseudo batch dimension removed.
        """
        patches, grids = packed_seq_patches
        lengths = torch.prod(grids, dim=-1)

        # Embed the packed sequence and add a pseudo batch dimension for the encoder
        embedded = self.embeddings((patches, lengths, grids)).unsqueeze(0)

        # Sequence boundaries for variable-length attention
        cu_seqlens, max_seqlen = create_cumulative_seq_lengths(lengths, embedded.device)

        encoded, _, _ = self.encoder(
            inputs_embeds=embedded,
            cu_seqlens=cu_seqlens,
            max_seqlen=max_seqlen,
        )

        # Final layer normalization
        output = self.post_layernorm(encoded)

        # Pixel shuffle is only applied for factors above 1
        if self.pixel_shuffle_scale_factor > 1:
            output = pixel_shuffle_varlen(
                x=output,
                token_grids=grids,
                scale_factor=self.pixel_shuffle_scale_factor,
            )

        # Strip the pseudo batch dimension before returning the full sequence
        return output.squeeze(0)
def _snap_up_to_multiple(size: float, divisor: int) -> int:
    """Round *size* up to the nearest multiple of *divisor*, never below one *divisor*."""
    return int(max(divisor, math.ceil(size / divisor) * divisor))


def _bisect_scale(lo: float, hi: float, eps: float, accept, want_smallest: bool) -> float:
    """Bisect ``[lo, hi]`` for the boundary of the monotone predicate *accept*.

    With ``want_smallest`` the upper bound tracks accepted scales and is
    returned; otherwise the lower bound tracks accepted scales and is returned.
    """
    while (hi - lo) >= eps:
        mid = (lo + hi) / 2
        if accept(mid) == want_smallest:
            hi = mid
        else:
            lo = mid
    return hi if want_smallest else lo


def get_image_size_for_max_num_patches(
    image_height: int,
    image_width: int,
    patch_size: int,
    max_num_patches: int,
    min_num_patches: int | None = None,
    eps: float = 1e-5,
    pixel_shuffle_scale: int = 1,
) -> tuple[int, int]:
    r"""Compute a target resolution whose patch grid satisfies patching parametrization.

    Args:
        image_height (`int`):
            Height in pixels of the source image prior to any resizing.
        image_width (`int`):
            Width in pixels of the source image prior to any resizing.
        patch_size (`int`):
            Size of the square patch used by the vision encoder.
        max_num_patches (`int`):
            Upper bound on `(height / patch_size) * (width / patch_size)` after resizing.
        min_num_patches (`int`, *optional*):
            Lower bound on the number of patches. When provided the image will be scaled up if necessary.
        eps (`float`, *optional*, defaults to 1e-5):
            Convergence tolerance for the internal binary search to determine the target dimensions.
        pixel_shuffle_scale (`int`, *optional*, defaults to 1):
            Additional stride multiplier applied when pixel shuffle later reduces spatial resolution.

    Returns:
        `tuple[int, int]`: Height and width (in pixels) that are multiples of `patch_size * pixel_shuffle_scale`
        and respect both the maximum and optional minimum patch-count constraints.
    """
    # Every returned side must be a multiple of this to stay patch- and shuffle-aligned.
    divisor = patch_size * pixel_shuffle_scale

    def size_at(scale: float) -> tuple[int, int]:
        return (
            _snap_up_to_multiple(scale * image_height, divisor),
            _snap_up_to_multiple(scale * image_width, divisor),
        )

    def patches_at(scale: float) -> float:
        height, width = size_at(scale)
        return (height / patch_size) * (width / patch_size)

    adjusted_height = _snap_up_to_multiple(image_height, divisor)
    adjusted_width = _snap_up_to_multiple(image_width, divisor)
    num_patches = (adjusted_height / patch_size) * (adjusted_width / patch_size)

    if min_num_patches is not None and num_patches < min_num_patches:
        # Scale up: smallest scale in [1, 100] that reaches the minimum.
        scale = _bisect_scale(1.0, 100.0, eps, lambda s: patches_at(s) >= min_num_patches, True)
        return size_at(scale)

    if num_patches <= max_num_patches:
        return adjusted_height, adjusted_width

    # Scale down: largest scale in [eps / 10, 1] that stays within the maximum.
    scale = _bisect_scale(eps / 10, 1.0, eps, lambda s: patches_at(s) <= max_num_patches, False)
    return size_at(scale)
+ + Raises: + ValueError: If `height` or `width` is not divisible by `patch_size`. + """ + num_images, height, width, channels = image.shape + if height % patch_size or width % patch_size: + raise ValueError(f"Dimensions of images {image.shape} are not divisible by patch_size={patch_size}.") + patches = image.reshape(num_images, height // patch_size, patch_size, width // patch_size, patch_size, channels) + patches = patches.permute(0, 1, 3, 2, 4, 5) + patches = patches.reshape(num_images, height // patch_size, width // patch_size, channels * patch_size * patch_size) + return patches + + +def process_vision_for_patches( + images: torch.Tensor, + patch_size: int, + max_num_patches: int, + min_num_patches: int | None = None, + pixel_shuffle_scale: int = 1, +) -> tuple[torch.Tensor, list[int]]: + r"""Resize, normalize, and patchify RGB images for the vision encoder. + + Args: + images (`torch.Tensor`): + Either `(height, width, channels)` for a single image or `(num_images, height, width, channels)` for a + batch. Channels are expected to be RGB. + patch_size (`int`): + Edge length of square patches; implictly controls resize grid granularity. + max_num_patches (`int`): + Maximum number of patches allowed after resizing. + min_num_patches (`int`, *optional*): + Minimum number of patches. If provided, the routine upsamples images as needed to satisfy the lower bound. + pixel_shuffle_scale (`int`, *optional*, defaults to 1): + pixel shuffle scale factor; influences the target grid that the function produces. + + Returns: + `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)` where `patches` has shape + `(num_images, target_h / patch_size, target_w / patch_size, channels * patch_size**2)` and `dims_virtual` + encodes effective `(images, height, width)` dimensions after optional pixel shuffling. 
class IsaacConfig(Qwen3Config):
    """Configuration class for Isaac multimodal model.

    Extends the Qwen3 text configuration with the vision preprocessing
    parameters and a nested vision-tower configuration.

    Args:
        vision_config: Vision tower configuration. May be `None` (a default
            `PixelShuffleSiglip2VisionConfig` is built from
            `pixel_shuffle_scale` and `vision_max_num_patches`), a mapping of
            keyword arguments for `PixelShuffleSiglip2VisionConfig`, or an
            already-constructed config object, which is stored as-is.
        vision_patch_size: Edge length of the square vision patches.
        vision_max_num_patches: Upper bound on patches per image.
        vision_min_num_patches: Optional lower bound on patches per image.
        pixel_shuffle_scale: Pixel-shuffle factor applied by the vision tower.
        max_sequence_length: Maximum sequence length used during processing.
        vision_token: Placeholder token string standing in for an image.
        **kwargs: Forwarded to `Qwen3Config`.
    """

    model_type = "isaac"
    sub_configs = {"vision_config": PixelShuffleSiglip2VisionConfig}

    def __init__(
        self,
        vision_config=None,
        vision_patch_size: int = 16,
        vision_max_num_patches: int = 256,
        vision_min_num_patches: int | None = None,
        pixel_shuffle_scale: int = 1,
        max_sequence_length: int = 16384,
        vision_token: str = "<|image_pad|>",
        **kwargs,
    ):
        super().__init__(**kwargs)

        # EventStreamProcessor parameters (for backward compatibility).
        # NOTE: the attribute is deliberately named `video_patch_size`;
        # readers of this config look it up under that name.
        self.video_patch_size = vision_patch_size
        self.vision_max_num_patches = vision_max_num_patches
        self.vision_min_num_patches = vision_min_num_patches
        self.pixel_shuffle_scale = pixel_shuffle_scale

        # Processing parameters
        self.max_sequence_length = max_sequence_length
        self.vision_token = vision_token

        # Honour a caller-supplied vision config instead of discarding it.
        # Top-level values only act as defaults, so an explicit entry in a
        # mapping wins; `None` reproduces the previous default construction.
        if vision_config is None:
            vision_config = {}
        if isinstance(vision_config, Mapping):
            vision_config = PixelShuffleSiglip2VisionConfig(
                **{
                    "pixel_shuffle_scale_factor": pixel_shuffle_scale,
                    "num_patches": vision_max_num_patches,
                    **vision_config,
                }
            )
        self.vision_config = vision_config
class IsaacProcessor:
    """Processor wrapper (tokenizer + IsaacImageProcessor).

    Args:
        image_processor: Object exposing `preprocess(images, return_tensors, **kwargs)`.
            When `None`, an `IsaacImageProcessor` is built from `kwargs`.
        tokenizer: Tokenizer used for text and for chat templating.
        **kwargs: Options handed to the default `IsaacImageProcessor`.
    """

    attributes = ["tokenizer"]
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        if image_processor is None:
            image_processor = IsaacImageProcessor(kwargs)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.image_token = "<|image_pad|>"

    def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
        """Tokenize `text` and/or preprocess `images` into one `BatchFeature`.

        The same `kwargs` are offered to the tokenizer and the image processor.
        """
        result = {}

        if text is not None:
            result.update(self.tokenizer(text, **kwargs))
        if images is not None:
            # `preprocess` takes `return_tensors` as a required argument, so
            # pass it explicitly (defaulting to None) rather than relying on
            # the caller to have put it in `kwargs`.
            image_kwargs = dict(kwargs)
            return_tensors = image_kwargs.pop("return_tensors", None)
            result.update(
                self.image_processor.preprocess(
                    images, return_tensors=return_tensors, **image_kwargs
                )
            )
        return BatchFeature(result)

    def apply_chat_template(
        self,
        messages: list[dict[str, Any]],
        tokenize: bool = False,
        add_generation_prompt: bool = False,
        **kwargs,
    ) -> Any:
        """Flatten mixed text/image messages and apply the tokenizer's chat template.

        A message whose `content` is a list is reduced to a single string:
        text items contribute their text, image items contribute the image
        placeholder token, bare strings are kept verbatim, and any other item
        type is dropped. Messages with non-list content pass through unchanged.
        """
        processed_messages = []

        for message in messages:
            content = message.get("content")
            if not isinstance(content, list):
                # Regular text message
                processed_messages.append(message)
                continue

            text_parts = []
            for content_item in content:
                if isinstance(content_item, str):
                    # Tolerate bare strings mixed into the content list.
                    text_parts.append(content_item)
                    continue
                item_type = content_item.get("type")
                if item_type == "text":
                    text_parts.append(content_item.get("text", ""))
                elif item_type == "image":
                    # Replace image with vision token
                    text_parts.append(self.image_token)

            processed_messages.append(
                {
                    "role": message.get("role", "user"),
                    "content": "".join(text_parts),
                }
            )

        return self.tokenizer.apply_chat_template(
            processed_messages, tokenize=tokenize, add_generation_prompt=add_generation_prompt, **kwargs
        )
class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]):
    """Builds placeholder text and images used for profiling."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Return one image placeholder token per requested image."""
        num_images = mm_counts.get("image", 0)
        image_token: str = self.info.get_hf_processor().image_token
        return image_token * num_images

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, Any] | None = None,
    ) -> MultiModalDataDict:
        """Return dummy images at the size reported by the processing info.

        `mm_options["image"]`, when present, is forwarded as the `overrides`
        argument of `_get_dummy_images`.
        """
        num_images = mm_counts.get("image", 0)

        # Read the fields by name so the result does not depend on the
        # positional order of ImageSize.
        image_size = self.info.get_image_size_with_most_features()
        image_overrides = mm_options.get("image") if mm_options else None

        return {
            "image": self._get_dummy_images(
                width=image_size.width,
                height=image_size.height,
                num_images=num_images,
                overrides=image_overrides,
            ),
        }


class IsaacMultiModalProcessor(BaseMultiModalProcessor):
    """Multimodal processor wiring Isaac's image tensors into prompts."""

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Describe how processor outputs are split per image.

        `pixel_values` is a flat concatenation whose per-image length is the
        product of that image's `image_grid_thw` row; `image_grid_thw` itself
        is batched with one entry per image.
        """
        image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3)))
        image_grid_sizes = image_grid_thw.prod(-1)

        return {
            "pixel_values": MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes),
            "image_grid_thw": MultiModalFieldConfig.batched("image"),
        }

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, Any],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptReplacement]:
        """Expand each image placeholder to the number of tokens its grid yields."""
        image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
        tokenizer = self.info.get_tokenizer()

        # Fallback id used when the tokenizer vocabulary has no "<|image_pad|>".
        # NOTE(review): 151655 is presumably the Qwen id of that token - confirm.
        placeholder_id = tokenizer.get_vocab().get("<|image_pad|>", 151655)

        pixel_shuffle_scale = getattr(image_processor, 'pixel_shuffle_scale', 2)
        # Pixel shuffle folds a scale x scale neighbourhood of patches into one token.
        merge_length = pixel_shuffle_scale ** 2

        def get_replacement_isaac(item_idx: int):
            out_item = out_mm_kwargs["image"][item_idx]
            grid_thw = out_item["image_grid_thw"].data
            assert isinstance(grid_thw, torch.Tensor)

            num_tokens = int(grid_thw.prod()) // merge_length
            return [placeholder_id] * num_tokens

        return [
            PromptReplacement(
                modality="image",
                target=[placeholder_id],
                replacement=get_replacement_isaac,
            )
        ]
def get_mrope_input_positions(
    self,
    input_tokens: list[int],
    hf_config: PretrainedConfig,
    image_grid_thw: list[list[int]] | torch.Tensor,
    video_grid_thw: list[list[int]] | torch.Tensor,
    context_len: int = 0,
    seq_len: int | None = None,
    second_per_grid_ts: list[float] | None = None,
    audio_feature_lengths: torch.Tensor | None = None,
    use_audio_in_video: bool = False,
) -> tuple[torch.Tensor, int]:
    """Get mrope input positions and delta value.

    The prompt is cut into alternating text and image segments. Each run of
    image placeholder tokens consumes the next `(t, h, w)` row of
    `image_grid_thw`, with h and w divided by the pixel-shuffle factor. The
    segments are then handed to `compute_mrope_pos_tensor`.

    Args:
        input_tokens: Prompt token ids.
        hf_config: Config whose `vision_config.pixel_shuffle_scale_factor`
            gives the spatial merge factor.
        image_grid_thw: One `(t, h, w)` entry per image, in prompt order.
        video_grid_thw: Unused; Isaac handles images only.
        context_len: First position to return.
        seq_len: End position (exclusive) to return; `None` means up to the end.
        second_per_grid_ts: Unused.
        audio_feature_lengths: Unused.
        use_audio_in_video: Unused.

    Returns:
        `(position_ids, delta)`. `position_ids` has the three position axes
        first and the `context_len:seq_len` slice of the sequence second.
        `delta` is `max_position + 1 - len(input_tokens)` (0 for text-only
        prompts).
    """
    vision_token_id = getattr(self.config, 'image_token_id', 151655)
    spatial_merge_size = hf_config.vision_config.pixel_shuffle_scale_factor
    num_tokens = len(input_tokens)

    # Find image token positions
    image_positions = torch.where(torch.tensor(input_tokens) == vision_token_id)[0].tolist()

    # Text-only prompt: all three axes share the plain 0..n-1 progression.
    if not image_positions:
        position_ids = torch.arange(num_tokens, dtype=torch.long).unsqueeze(0).expand(3, -1)
        return position_ids[:, context_len:seq_len], 0

    def text_event(length: int) -> Event:
        # A text run of `length` tokens.
        return Event(
            modality_type=TextType.text,
            dims_virtual=[length, 1],
            idx_range=(0, length),
        )

    events = []
    image_idx = 0
    current_pos = 0  # first token index not yet assigned to an event

    for image_pos in image_positions:
        if image_pos < current_pos:
            continue  # placeholder already consumed by the previous image

        # Add any text before this image
        if image_pos > current_pos:
            events.append(text_event(image_pos - current_pos))

        # Normalise to Python ints so tensor-valued grids do not leak 0-dim
        # tensors into the event dims or the position bookkeeping.
        t, h, w = (int(v) for v in image_grid_thw[image_idx])
        llm_grid_h, llm_grid_w = h // spatial_merge_size, w // spatial_merge_size
        image_tokens = t * llm_grid_h * llm_grid_w

        events.append(
            Event(
                modality_type=VisionType.image,
                dims_virtual=[t, llm_grid_h, llm_grid_w],
                idx_range=(0, image_tokens),
            )
        )

        current_pos = image_pos + image_tokens
        image_idx += 1

    # Add final text segment if any
    if current_pos < num_tokens:
        events.append(text_event(num_tokens - current_pos))

    tensor_stream = TensorStream([Stream(events)])

    # Use Isaac's native MRoPE calculation
    position_ids = compute_mrope_pos_tensor(tensor_stream, n_pos_dims=3)

    # Max position per batch across the 3 planes and sequence dimension: (B,)
    m_per_batch = position_ids.amax(dim=(1, 2))
    mrope_position_delta = (m_per_batch + 1 - num_tokens).item()

    # vLLM expects shape [3, seq_len] but Isaac returns [batch, seq_len, 3]
    position_ids = position_ids.squeeze(0).transpose(0, 1)

    # Honour the requested window; with the defaults this is the full range.
    return position_ids[:, context_len:seq_len], mrope_position_delta
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    """Load checkpoint weights through `AutoWeightsLoader`.

    Names are remapped with `hf_to_vllm_mapper`; weights under
    `vision_embedding.` are skipped only when no vision module is attached.
    Returns whatever the loader reports as loaded.
    """
    prefixes_to_skip = ["vision_embedding."] if self.vision_embedding is None else []
    weights_loader = AutoWeightsLoader(self, skip_prefixes=prefixes_to_skip)
    return weights_loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
Signed-off-by: oscardev256 <42308241+oscardev256@users.noreply.github.com> --- vllm/model_executor/models/registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 4575e91e13959..aeca078775b43 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -326,6 +326,7 @@ _MULTIMODAL_MODELS = { "idefics3", "Idefics3ForConditionalGeneration", ), + "IsaacForConditionalGeneration": ("isaac", "IsaacForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"), # noqa: E501 "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), "KeyeVL1_5ForConditionalGeneration": ( From 37a92d952b12a144f6d887bf75b8981f98ab647e Mon Sep 17 00:00:00 2001 From: Oscar Gonzalez Date: Sat, 15 Nov 2025 01:00:01 -0500 Subject: [PATCH 03/18] Updated to use Siglip2Encoder defined in siglip2navit.py. Signed-off-by: Oscar Gonzalez --- vllm/model_executor/models/isaac.py | 434 ++++++++++++++-------------- 1 file changed, 222 insertions(+), 212 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 4f29d1cff347a..f3a589faa163e 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -29,6 +29,7 @@ from vllm.model_executor.models.utils import ( WeightsMapper, AutoWeightsLoader, _merge_multimodal_embeddings, + maybe_prefix, ) from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -308,14 +309,6 @@ class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): self.num_patches = num_patches -def create_cumulative_seq_lengths(seq_sizes: torch.Tensor, device: torch.device) -> tuple[torch.Tensor, int]: - """Create cumulative sequence lengths for variable-length attention.""" - cu_seqlens = torch.zeros(len(seq_sizes) + 1, dtype=torch.int32, device=device) 
- cu_seqlens[1:] = seq_sizes.cumsum(0) - max_seqlen = int(seq_sizes.max().item()) if len(seq_sizes) > 0 else 0 - return cu_seqlens, max_seqlen - - class Siglip2VariableSequenceEmbeddings(nn.Module): def __init__(self, config: PixelShuffleSiglip2VisionConfig): super().__init__() @@ -380,7 +373,6 @@ class Siglip2VariableSequenceEmbeddings(nn.Module): pos_embeds = self.positional_embeddings(packed_seq_patches) # Flatten patch embeddings to match positional embeddings format - # From [batch, patches_per_image, embed_dim] to [total_patches, embed_dim] batch_size, patches_per_image, embed_dim = patch_embeds.shape # For variable-length attention, we need to reshape to (total_tokens, embed_dim) @@ -394,158 +386,6 @@ class Siglip2VariableSequenceEmbeddings(nn.Module): return embeddings -class Siglip2VariableLengthAttention(nn.Module): - """Custom attention that supports variable-length sequences with flash attention.""" - - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - def forward(self, hidden_states, cu_seqlens=None, max_seqlen=None): - batch_size, seq_len, _ = hidden_states.size() - - # For variable-length attention, we need to reshape to (total_tokens, embed_dim) - if batch_size != 1: - raise ValueError("Variable-length attention expects batch_size=1 for packed sequences") - hidden_states = hidden_states.squeeze(0) # Remove batch dimension: (seq_len, embed_dim) - - # Store original dtype - orig_dtype = hidden_states.dtype - - # 1. Linear projections - Q = self.q_proj(hidden_states) # (seq_len, embed_dim) - K = self.k_proj(hidden_states) # (seq_len, embed_dim) - V = self.v_proj(hidden_states) # (seq_len, embed_dim) - - # 2. Reshape for multi-head attention: (seq_len, n_heads, head_dim) - Q = Q.view(-1, self.num_heads, self.embed_dim // self.num_heads) - K = K.view(-1, self.num_heads, self.embed_dim // self.num_heads) - V = V.view(-1, self.num_heads, self.embed_dim // self.num_heads) - - # 3. Apply variable-length attention using flash attention - attn_output, _, _, _, _ = torch.ops.aten._flash_attention_forward( - query=Q, - key=K, - value=V, - cum_seq_q=cu_seqlens, - cum_seq_k=cu_seqlens, - max_q=max_seqlen, - max_k=max_seqlen, - dropout_p=self.dropout if self.training else 0.0, - is_causal=False, - return_debug_mask=False, - scale=self.scale, - window_size_left=-1, - window_size_right=-1, - alibi_slopes=None, - ) - - # 4. Reshape attention output from (seq_len, n_heads, head_dim) to (seq_len, embed_dim) - attn_output = attn_output.reshape(seq_len, self.embed_dim) - - # 5. Convert back to original dtype if needed - if attn_output.dtype != orig_dtype: - attn_output = attn_output.to(orig_dtype) - - # 6. 
Project output - attn_output = self.out_proj(attn_output) # (seq_len, embed_dim) - - # 7. Add back batch dimension for compatibility - attn_output = attn_output.unsqueeze(0) # (1, seq_len, embed_dim) - - return attn_output, None - - -class IsaacSiglip2EncoderLayer(nn.Module): - """Siglip2 encoder layer with variable-length attention.""" - - def __init__(self, config: PixelShuffleSiglip2VisionConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = Siglip2VariableLengthAttention(config) - - self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = Siglip2MLP(config) # Use HF's Siglip2MLP - self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - cu_seqlens: torch.Tensor = None, - max_seqlen: int = None, - ) -> tuple[torch.FloatTensor]: - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - ) - - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - return (hidden_states,) - - -class IsaacEncoder(nn.Module): - """Encoder using Isaac encoder layers with variable-length attention support.""" - - def __init__(self, config: PixelShuffleSiglip2VisionConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([IsaacSiglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) - - def forward( - self, - inputs_embeds, - cu_seqlens: torch.Tensor | None = None, - max_seqlen: int | None = None, - output_hidden_states: bool = False, - ): - all_hidden_states = () if output_hidden_states else None - - hidden_states = inputs_embeds - - for encoder_layer in self.layers: - if output_hidden_states: - 
all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = encoder_layer( - hidden_states, - cu_seqlens, - max_seqlen, - ) - - hidden_states = layer_outputs[0] - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - return hidden_states, all_hidden_states, None - - def create_pixel_shuffle_index_map( seq_sizes: torch.Tensor, token_grids: torch.Tensor, @@ -669,52 +509,6 @@ def pixel_shuffle_varlen( out = out.unsqueeze(0) return out - -class Siglip2SequenceVisionTransformer(nn.Module): - def __init__(self, config: PixelShuffleSiglip2VisionConfig): - super().__init__() - self.config = config - self.embeddings = Siglip2VariableSequenceEmbeddings(config) - self.encoder = IsaacEncoder(config) - self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.pixel_shuffle_scale_factor = config.pixel_shuffle_scale_factor - - def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor]): - seq_patches, token_grids = packed_seq_patches - seq_sizes = torch.prod(token_grids, dim=-1) - - # Get embeddings from packed sequence - hidden_states = self.embeddings((seq_patches, seq_sizes, token_grids)) - - # Add a pseudo batch dimension for the encoder - hidden_states = hidden_states.unsqueeze(0) - - # Generate cumulative sequence lengths for variable-length attention - cu_seqlens, max_seqlen = create_cumulative_seq_lengths(seq_sizes, hidden_states.device) - - # Pass through encoder with variable-length attention parameters - hidden_states, _, _ = self.encoder( - inputs_embeds=hidden_states, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - ) - - # Apply final layer normalization - hidden_states = self.post_layernorm(hidden_states) - - if self.pixel_shuffle_scale_factor > 1: - hidden_states = pixel_shuffle_varlen( - x=hidden_states, - token_grids=token_grids, - scale_factor=self.pixel_shuffle_scale_factor, - ) - # Remove the pseudo batch dimension we added earlier - hidden_states = 
# NOTE(review): the multimodal/LoRA/PP/MRoPE mixins and `is_pooling_model` look
# carried over from a top-level model template - confirm they are intended for
# a vision tower. They are kept so isinstance checks behave as before.
class Siglip2VisionTransformer(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE):
    """Vision tower: patch embeddings -> Siglip2 encoder -> layer norm -> pixel shuffle."""

    is_pooling_model = True

    merge_by_field_config = True

    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    def __init__(
        self,
        config,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
        use_data_parallel: bool = False,
        attn_backend_override: _Backend | None = None,
    ):
        super().__init__()
        self.config = config
        self.quant_config = quant_config
        embed_dim = config.hidden_size

        self.embeddings = Siglip2VariableSequenceEmbeddings(config)
        self.pixel_shuffle_scale_factor = config.pixel_shuffle_scale_factor
        self.encoder = Siglip2Encoder(
            config,
            quant_config=quant_config,
            prefix=f"{prefix}.encoder",
            use_data_parallel=use_data_parallel,
            attn_backend_override=attn_backend_override,
        )
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        packed_seq_patches: tuple[torch.Tensor, torch.Tensor],
    ) -> torch.Tensor:
        r"""Encode a packed sequence of image patches.

        Args:
            packed_seq_patches: Pair `(seq_patches, token_grids)`.
                `seq_patches` holds the patches of all images concatenated;
                `token_grids` has one `(height, width)` row of patch counts
                per image.

        Returns:
            Encoded patch features for all images, pixel-shuffled when
            `pixel_shuffle_scale_factor > 1`, without a batch dimension.
        """
        seq_patches, token_grids = packed_seq_patches
        seq_sizes = torch.prod(token_grids, dim=-1)

        # Get embeddings from packed sequence
        hidden_states = self.embeddings((seq_patches, seq_sizes, token_grids))

        # One (t=1, h, w) row per image, so the encoder sees every image's
        # extent rather than only the first one's.
        grid_thws = torch.tensor(
            [[1, int(h), int(w)] for h, w in token_grids.tolist()],
            dtype=torch.long,
        )
        last_hidden_state = self.encoder(hidden_states, grid_thws)
        hidden_states = self.post_layernorm(last_hidden_state)

        # pixel_shuffle_varlen is given a pseudo batch dimension
        hidden_states = hidden_states.unsqueeze(0)

        if self.pixel_shuffle_scale_factor > 1:
            hidden_states = pixel_shuffle_varlen(
                x=hidden_states,
                token_grids=token_grids,
                scale_factor=self.pixel_shuffle_scale_factor,
            )
        # Remove the pseudo batch dimension we added earlier
        hidden_states = hidden_states.squeeze(0)

        return hidden_states

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint weights, fusing q/k/v and gate/up shards.

        Returns the set of parameter names that were loaded.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if self.quant_config is not None and (
                scale_name := self.quant_config.get_cache_scale(name)
            ):
                # Loading kv cache quantization scales
                param = params_dict[scale_name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                loaded_weight = (
                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
                )
                weight_loader(param, loaded_weight)
                loaded_params.add(scale_name)
                continue
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                if name.endswith("scale"):
                    # Remapping the name of FP8 kv-scale.
                    name = maybe_remap_kv_scale_name(name, params_dict)
                    if name is None:
                        continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                if weight_loader == default_weight_loader:
                    weight_loader(param, loaded_weight)
                else:
                    weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Not a stacked shard: load under its own (possibly remapped) name.
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params
hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ - "model.vision_embedding.": "vision_embedding.", + "model.vision_embedding.": "vision_embedding.", } ) @@ -1315,13 +1269,18 @@ class IsaacForConditionalGeneration( "norm": self.model.norm }) + config.vision_config.preserve_original_pe = True + config.vision_config.use_rope = False + config.vision_config.hidden_stride = config.vision_config.pixel_shuffle_scale_factor + config.vision_config.window_size = 32*2 + config.vision_config.fullatt_block_indexes = None vision_cfg = config.vision_config if vision_cfg is None: raise ValueError("IsaacConfig should always have vision_config") hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) self.vision_embedding = nn.Sequential( - Siglip2SequenceVisionTransformer(vision_cfg), + Siglip2VisionTransformer(vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding")), nn.Linear( hidden_dim, 4 * hidden_dim, @@ -1472,10 +1431,61 @@ class IsaacForConditionalGeneration( return inputs_embeds + def merge_qkv_weights( + weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[tuple[str, torch.Tensor]]: + """Merge separate Q, K, V projection weights into QKV format.""" + + # Buffer to collect q, k, v weights for each layer + qkv_buffer = {} + + for name, tensor in weights: + # Check if this is a q/k/v projection weight + if '.q_proj.' in name or '.k_proj.' in name or '.v_proj.' in name: + # Extract the base name (everything before q/k/v_proj) + if '.q_proj.' in name: + base_name = name.replace('.q_proj.', '.qkv_proj.') + proj_type = 'q' + elif '.k_proj.' 
in name: + base_name = name.replace('.k_proj.', '.qkv_proj.') + proj_type = 'k' + else: # v_proj + base_name = name.replace('.v_proj.', '.qkv_proj.') + proj_type = 'v' + + # Store in buffer + if base_name not in qkv_buffer: + qkv_buffer[base_name] = {} + qkv_buffer[base_name][proj_type] = tensor + + # If we have all three (q, k, v), merge and yield + if len(qkv_buffer[base_name]) == 3: + q = qkv_buffer[base_name]['q'] + k = qkv_buffer[base_name]['k'] + v = qkv_buffer[base_name]['v'] + + # Concatenate along dim 0 for weight, dim agnostic for bias + merged = torch.cat([q, k, v], dim=0) + yield base_name, merged + + # Clear buffer + del qkv_buffer[base_name] + else: + # Pass through non-qkv weights unchanged + yield name, tensor + + # Check if any incomplete qkv sets remain + if qkv_buffer: + raise ValueError(f"Incomplete QKV weights found: {list(qkv_buffer.keys())}") + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: skip_prefixes = [] - if self.vision_embedding is None: - skip_prefixes.extend(["vision_embedding."]) + #if self.vision_embedding is None: + # skip_prefixes.extend(["vision_embedding."]) + + # Usage: + #weights = self.merge_qkv_weights(weights) loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 0dbe093c562c356f0fcc7348c6cd33373c3f1ace Mon Sep 17 00:00:00 2001 From: Oscar Gonzalez Date: Tue, 18 Nov 2025 02:01:35 -0500 Subject: [PATCH 04/18] Updated load_weight for Siglip2VisionTransformer Signed-off-by: Oscar Gonzalez --- vllm/model_executor/models/isaac.py | 160 +++++----------------------- 1 file changed, 27 insertions(+), 133 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index f3a589faa163e..786b1fe4e6f1c 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -18,9 +18,6 @@ import torch.nn.functional as F from transformers import PretrainedConfig, 
Qwen3Config from transformers.image_processing_utils import BatchFeature from transformers.tokenization_utils import TensorType -from transformers.models.siglip2.modeling_siglip2 import ( - Siglip2MLP, -) from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -30,6 +27,7 @@ from vllm.model_executor.models.utils import ( AutoWeightsLoader, _merge_multimodal_embeddings, maybe_prefix, + init_vllm_registered_model, ) from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -54,6 +52,15 @@ from vllm.model_executor.models.interfaces import ( SupportsPP, ) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, +) +from vllm.model_executor.models.siglip2navit import Siglip2Encoder +from vllm.attention.backends.registry import _Backend +from vllm.model_executor.layers.quantization import QuantizationConfig + +from vllm.model_executor.layers.linear import ReplicatedLinear + # ===== TensorStream Compatibility Layer for Isaac MRoPE ===== # Minimal implementation of TensorStream classes needed for Isaac's 3D positional encoding @@ -316,9 +323,10 @@ class Siglip2VariableSequenceEmbeddings(nn.Module): self.embed_dim = config.hidden_size self.patch_size = config.patch_size - self.patch_embedding = nn.Linear( - in_features=config.num_channels * self.patch_size * self.patch_size, - out_features=self.embed_dim, + self.patch_embedding = ReplicatedLinear( + input_size=config.num_channels * self.patch_size * self.patch_size, + output_size=self.embed_dim, + return_bias=False, ) self.num_patches = config.num_patches @@ -1058,37 +1066,10 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): ) ] -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) -from vllm.model_executor.models.utils import is_pp_missing_parameter -from 
vllm.model_executor.models.siglip2navit import Siglip2VisionEmbeddings, Siglip2Encoder -from vllm.attention.backends.registry import _Backend -from vllm.model_executor.layers.quantization import QuantizationConfig - -class Siglip2VisionTransformer(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE -): - - is_pooling_model = True - - merge_by_field_config = True - - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - +class Siglip2VisionTransformer(nn.Module): def __init__( self, - config, + config: PixelShuffleSiglip2VisionConfig, quant_config: QuantizationConfig | None = None, prefix: str = "", use_data_parallel: bool = False, @@ -1151,64 +1132,28 @@ class Siglip2VisionTransformer(nn.Module, SupportsMultiModal, SupportsLoRA, Supp ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) + params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() + for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - if name.endswith("scale"): - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue + param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - if weight_loader == default_weight_loader: - weight_loader(param, loaded_weight) - else: - weight_loader(param, loaded_weight, shard_id) + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) break else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - if is_pp_missing_parameter(name, self): - continue - print(f"qwen2: name={name}") param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + @MULTIMODAL_REGISTRY.register_processor( IsaacMultiModalProcessor, info=IsaacProcessingInfo, @@ -1217,6 +1162,7 @@ class Siglip2VisionTransformer(nn.Module, SupportsMultiModal, SupportsLoRA, Supp class IsaacForConditionalGeneration( Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): + packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -1230,7 +1176,7 @@ class IsaacForConditionalGeneration( } supports_encoder_tp_data = True - + # To ensure correct weight loading and mapping. 
hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -1261,14 +1207,14 @@ class IsaacForConditionalGeneration( # Initialize the parent class with updated config super().__init__(vllm_config=vllm_config, prefix=prefix) - + # Create the language model module to match checkpoint structure self.language_model = nn.ModuleDict({ "embed_tokens": self.model.embed_tokens, "layers": self.model.layers, "norm": self.model.norm }) - + config.vision_config.preserve_original_pe = True config.vision_config.use_rope = False config.vision_config.hidden_stride = config.vision_config.pixel_shuffle_scale_factor @@ -1431,61 +1377,9 @@ class IsaacForConditionalGeneration( return inputs_embeds - def merge_qkv_weights( - weights: Iterable[tuple[str, torch.Tensor]] - ) -> Iterable[tuple[str, torch.Tensor]]: - """Merge separate Q, K, V projection weights into QKV format.""" - - # Buffer to collect q, k, v weights for each layer - qkv_buffer = {} - - for name, tensor in weights: - # Check if this is a q/k/v projection weight - if '.q_proj.' in name or '.k_proj.' in name or '.v_proj.' in name: - # Extract the base name (everything before q/k/v_proj) - if '.q_proj.' in name: - base_name = name.replace('.q_proj.', '.qkv_proj.') - proj_type = 'q' - elif '.k_proj.' 
in name: - base_name = name.replace('.k_proj.', '.qkv_proj.') - proj_type = 'k' - else: # v_proj - base_name = name.replace('.v_proj.', '.qkv_proj.') - proj_type = 'v' - - # Store in buffer - if base_name not in qkv_buffer: - qkv_buffer[base_name] = {} - qkv_buffer[base_name][proj_type] = tensor - - # If we have all three (q, k, v), merge and yield - if len(qkv_buffer[base_name]) == 3: - q = qkv_buffer[base_name]['q'] - k = qkv_buffer[base_name]['k'] - v = qkv_buffer[base_name]['v'] - - # Concatenate along dim 0 for weight, dim agnostic for bias - merged = torch.cat([q, k, v], dim=0) - yield base_name, merged - - # Clear buffer - del qkv_buffer[base_name] - else: - # Pass through non-qkv weights unchanged - yield name, tensor - - # Check if any incomplete qkv sets remain - if qkv_buffer: - raise ValueError(f"Incomplete QKV weights found: {list(qkv_buffer.keys())}") - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: skip_prefixes = [] - #if self.vision_embedding is None: - # skip_prefixes.extend(["vision_embedding."]) - - # Usage: - #weights = self.merge_qkv_weights(weights) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 2c13695951c45749d795d6858ce89f7c580dcb61 Mon Sep 17 00:00:00 2001 From: Yang Date: Thu, 20 Nov 2025 15:39:18 -0800 Subject: [PATCH 05/18] org and add imports and fix lint error Signed-off-by: Yang --- vllm/model_executor/models/isaac.py | 483 +++++++++++++++++----------- 1 file changed, 298 insertions(+), 185 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 786b1fe4e6f1c..5c61e5bf48a70 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -1,49 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations -from collections.abc import Mapping, 
Sequence, Iterable -from typing import Any, Optional, Union -from typing_extensions import TypedDict, Unpack - import itertools -from enum import Enum -from dataclasses import dataclass - import math +from collections.abc import Iterable, Mapping, Sequence +from dataclasses import dataclass +from enum import Enum +from typing import Any + import numpy as np import PIL.Image import torch import torch.nn as nn import torch.nn.functional as F - from transformers import PretrainedConfig, Qwen3Config from transformers.image_processing_utils import BatchFeature -from transformers.tokenization_utils import TensorType from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig +from transformers.tokenization_utils import TensorType +from typing_extensions import TypedDict, Unpack -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.model_executor.models.interfaces import SupportsMultiModal -from vllm.model_executor.models.utils import ( - WeightsMapper, - AutoWeightsLoader, - _merge_multimodal_embeddings, - maybe_prefix, - init_vllm_registered_model, -) -from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM -from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.multimodal.processing import ( - BaseMultiModalProcessor, - BaseProcessingInfo, - PromptReplacement, -) -from vllm.multimodal.parse import MultiModalDataItems, ImageSize -from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.inputs import ( - MultiModalFieldConfig, - MultiModalKwargs, - MultiModalDataDict, -) +from vllm.attention.backends.registry import _Backend from vllm.config import VllmConfig +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, +) from vllm.model_executor.models.interfaces import ( MultiModalEmbeddings, SupportsLoRA, @@ -51,18 +34,34 @@ 
from vllm.model_executor.models.interfaces import ( SupportsMultiModal, SupportsPP, ) - -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, -) +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.models.siglip2navit import Siglip2Encoder -from vllm.attention.backends.registry import _Backend -from vllm.model_executor.layers.quantization import QuantizationConfig - -from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + WeightsMapper, + _merge_multimodal_embeddings, + maybe_prefix, +) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalFieldConfig, + MultiModalKwargs, +) +from vllm.multimodal.parse import ImageSize, MultiModalDataItems +from vllm.multimodal.processing import ( + BaseMultiModalProcessor, + BaseProcessingInfo, + PromptReplacement, + PromptUpdate, +) +from vllm.multimodal.profiling import BaseDummyInputsBuilder # ===== TensorStream Compatibility Layer for Isaac MRoPE ===== -# Minimal implementation of TensorStream classes needed for Isaac's 3D positional encoding +# Minimal implementation of TensorStream classes needed for Isaac's 3D positional +# encoding + class ModalityType(Enum): """ @@ -127,37 +126,46 @@ class TextType(ModalityType): @dataclass class Event: """Represents a single modality event with spatial/temporal dimensions.""" + """ - Represents a single data occurrence (with a specific type, time interval, and data payload). + Represents a single data occurrence (with a specific type, time interval, and + data payload). Attributes: - data (Any): The actual data payload (e.g. a torch.Tensor, a string, etc.). - type (ModalityType): The modality type of the data (e.g., VisionType.image). 
- time (Tuple[float, float]): (start_time, end_time) indicating when this Event occurs. - role (Optional[str]): The role associated with this event (e.g., "user", "agent", "system"). - If None, the event is always included in loss calculation. + data (Any): The actual data payload (e.g. a torch.Tensor, a string, + etc.). + type (ModalityType): The modality type of the data (e.g., + VisionType.image). + time (Tuple[float, float]): (start_time, end_time) indicating when this + Event occurs. + role (Optional[str]): The role associated with this event (e.g., "user", + "agent", "system"). If None, the event is always included in loss + calculation. Example usage: evt = Event(data=torch.zeros((1, 224, 224, 3)), # e.g. a single image frame type=VisionType.image, time=(0.0, 0.04), role="user") - """ + """ # Descriptors modality_type: ModalityType - + # Structure - dims_virtual: list[int] | None = None # virtual/processed dimensions (e.g., pixel-shuffled) + dims_virtual: list[int] | None = ( + None # virtual/processed dimensions (e.g., pixel-shuffled) + ) dims_real: list[int] | None = None # real/actual tensor dimensions idx_range: tuple[int, int] | None = None - + def dims(self, virtual: bool = True) -> list[int] | None: """ Get the dimensions of this event. Args: - virtual: If True (default), return virtual/processed dimensions (e.g., pixel-shuffled). - If False, return real/actual tensor dimensions. + virtual: If True (default), return virtual/processed dimensions + (e.g., pixel-shuffled). If False, return real/actual tensor + dimensions. Returns: Dimensions list or None if not measured. 
@@ -171,7 +179,9 @@ class Event: if not virtual: assert partial is False and isinstance(self.data, torch.Tensor) return math.prod(self.dims(virtual=False)) - return self.idx_range[1] - self.idx_range[0] if partial else math.prod(self.dims()) + return ( + self.idx_range[1] - self.idx_range[0] if partial else math.prod(self.dims()) + ) @dataclass @@ -215,7 +225,8 @@ class Stream: yield from self.events -# TODO: implement all types of cool indexing which can happen since TensorStream assuems Event.data = Tensor +# TODO: implement all types of cool indexing which can happen since TensorStream +# assumes Event.data = Tensor @dataclass class TensorStream: streams: list[Stream] @@ -254,7 +265,8 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten cumulative_offset = 0 # running time index for this stream for event in stream: - # --- build coordinate grid for THIS event using itertools (no tensor ops) --- + # --- build coordinate grid for THIS event using itertools + # (no tensor ops) --- dims = (event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or [])) # Create ranges for each dimension (similar to old _finalize implementation) @@ -274,26 +286,30 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten # Convert to tensor and reshape to (B, T, n_pos_dims) B, T = ts.shape - return torch.tensor(all_coords, dtype=torch.long, device=ts.device).reshape(B, T, n_pos_dims) + return torch.tensor(all_coords, dtype=torch.long, device=ts.device).reshape( + B, T, n_pos_dims + ) def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor: """Create boolean mask for specific modality type in the tensor stream.""" B, T = ts.shape mask = torch.zeros((B, T), dtype=torch.bool, device=ts.device) - + for batch_idx, stream in enumerate(ts.streams): seq_idx = 0 for event in stream: if event.modality_type == modality_type: start, end = event.idx_range - mask[batch_idx, seq_idx:seq_idx+(end-start)] = True - 
seq_idx += (event.idx_range[1] - event.idx_range[0]) - + mask[batch_idx, seq_idx : seq_idx + (end - start)] = True + seq_idx += event.idx_range[1] - event.idx_range[0] + return mask + # ===== End TensorStream Compatibility Layer ===== + class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): """Vision configuration for Isaac with Pixel Shuffle support. @@ -338,7 +354,9 @@ class Siglip2VariableSequenceEmbeddings(nn.Module): ) -> torch.Tensor: # Prepare positional embeddings grid: (1, embed_dim, h, w) positional_embeddings = ( - self.position_embedding.weight.reshape(self.position_embedding_size, self.position_embedding_size, -1) + self.position_embedding.weight.reshape( + self.position_embedding_size, self.position_embedding_size, -1 + ) .permute(2, 0, 1) .unsqueeze(0) ) @@ -359,12 +377,16 @@ class Siglip2VariableSequenceEmbeddings(nn.Module): align_corners=align_corners, antialias=antialias, ) - # Reshape from (1, embed_dim, height, width) to (height*width, embed_dim) - resized_pos_embed = resized_pos_embed.reshape(self.embed_dim, height * width).transpose(0, 1) + # Reshape from (1, embed_dim, height, width) to + # (height*width, embed_dim) + resized_pos_embed = resized_pos_embed.reshape( + self.embed_dim, height * width + ).transpose(0, 1) else: # Fallback - should never happen in practice resized_pos_embed = positional_embeddings.reshape( - self.embed_dim, self.position_embedding_size * self.position_embedding_size + self.embed_dim, + self.position_embedding_size * self.position_embedding_size, ).transpose(0, 1)[: height * width] pos_embeds_list.append(resized_pos_embed) @@ -372,7 +394,9 @@ class Siglip2VariableSequenceEmbeddings(nn.Module): pos_embeds = torch.cat(pos_embeds_list, dim=0) return pos_embeds - def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Tensor]): + def forward( + self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Tensor] + ): seq_patches, _seq_sizes, _spatial_shapes = packed_seq_patches # Apply 
patch embeddings @@ -385,7 +409,9 @@ class Siglip2VariableSequenceEmbeddings(nn.Module): # For variable-length attention, we need to reshape to (total_tokens, embed_dim) if batch_size != 1: - raise ValueError("Variable-length attention expects batch_size=1 for packed sequences") + raise ValueError( + "Variable-length attention expects batch_size=1 for packed sequences" + ) patch_embeds = patch_embeds.view(batch_size * patches_per_image, embed_dim) @@ -427,11 +453,13 @@ def create_pixel_shuffle_index_map( # Safety: all spatial dims must be divisible by r # Cannot run under torch compile fullgraph mode hence - if not torch.compiler.is_compiling(): - if not ((token_grids[:, 0] % r == 0).all() and (token_grids[:, 1] % r == 0).all()): - raise AssertionError( - f"Every (H,W) in `token_grids` must be divisible by scale_factor={r}, got {token_grids.tolist()}" - ) + if not torch.compiler.is_compiling() and not ( + (token_grids[:, 0] % r == 0).all() and (token_grids[:, 1] % r == 0).all() + ): + raise AssertionError( + "Every (H,W) in `token_grids` must be divisible by " + f"scale_factor={r}, got {token_grids.tolist()}" + ) gather_chunks: list[torch.Tensor] = [] tok_offset = 0 @@ -467,19 +495,23 @@ def pixel_shuffle_varlen( Args: x (`torch.Tensor`): - Concatenated vision embeddings. Accepts `(seq_len, hidden_size)` or `(1, seq_len, hidden_size)` shapes - produced by stacking image patches. + Concatenated vision embeddings. Accepts `(seq_len, hidden_size)` or + `(1, seq_len, hidden_size)` shapes produced by stacking image + patches. token_grids (`torch.Tensor`): - Integer tensor of shape `(num_images, 2)` whose rows give the `(height, width)` patch grid sizes - corresponding to each image segment inside `x`. + Integer tensor of shape `(num_images, 2)` whose rows give the + `(height, width)` patch grid sizes corresponding to each image + segment inside `x`. scale_factor (`int`, *optional*, defaults to 1): - Spatial down-sampling factor specific to pixel shuffle. 
Values greater than one merge `scale_factor**2` neighboring patches into a + Spatial down-sampling factor specific to pixel shuffle. Values + greater than one merge `scale_factor**2` neighboring patches into a single embedding channel-group. Returns: - `torch.Tensor`: Pixel-shuffled embeddings with shape matching the input convention: - `(seq_len, hidden_size * scale_factor**2)` when the input was 2D, or `(1, seq_len, hidden_size * scale_factor**2)` - if the singleton batch dimension was present. + `torch.Tensor`: Pixel-shuffled embeddings with shape matching the input + convention: `(seq_len, hidden_size * scale_factor**2)` when the input + was 2D, or `(1, seq_len, hidden_size * scale_factor**2)` if the + singleton batch dimension was present. Raises: ValueError: If more than one batch item is provided. @@ -517,6 +549,7 @@ def pixel_shuffle_varlen( out = out.unsqueeze(0) return out + # ============================================================================ # Configuration # ============================================================================ @@ -550,7 +583,9 @@ def _make_writeable(arr: np.ndarray) -> np.ndarray: def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None: if image.width * image.height > MAX_PIXELS: - raise ValueError(f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`") + raise ValueError( + f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`" + ) img = image if image.mode == "RGB" else image.convert("RGB") arr = np.asarray(img) arr = _make_writeable(arr) @@ -576,17 +611,22 @@ def get_image_size_for_max_num_patches( patch_size (`int`): Size of the square patch used by the vision encoder. max_num_patches (`int`): - Upper bound on `(height / patch_size) * (width / patch_size)` after resizing. + Upper bound on `(height / patch_size) * (width / patch_size)` after + resizing. min_num_patches (`int`, *optional*): - Lower bound on the number of patches. 
When provided the image will be scaled up if necessary. + Lower bound on the number of patches. When provided the image will + be scaled up if necessary. eps (`float`, *optional*, defaults to 1e-5): - Convergence tolerance for the internal binary search to determing the target dimensions. + Convergence tolerance for the internal binary search to determine + the target dimensions. pixel_shuffle_scale (`int`, *optional*, defaults to 1): - Additional stride multiplier applied when pixel shuffle later reduces spatial resolution. + Additional stride multiplier applied when pixel shuffle later + reduces spatial resolution. Returns: - `tuple[int, int]`: Height and width (in pixels) that are multiples of `patch_size * pixel_shuffle_scale` - and respect both the maximum and optional minimum patch-count constraints. + `tuple[int, int]`: Height and width (in pixels) that are multiples of + `patch_size * pixel_shuffle_scale` and respect both the maximum and + optional minimum patch-count constraints. 
""" def get_scaled_image_size(scale, original_size, patch_size, pixel_shuffle_scale): @@ -610,16 +650,24 @@ def get_image_size_for_max_num_patches( scale_min, scale_max = 1.0, 100.0 while (scale_max - scale_min) >= eps: scale = (scale_min + scale_max) / 2 - target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) - target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + target_height = get_scaled_image_size( + scale, image_height, patch_size, pixel_shuffle_scale + ) + target_width = get_scaled_image_size( + scale, image_width, patch_size, pixel_shuffle_scale + ) num_patches = (target_height / patch_size) * (target_width / patch_size) if num_patches >= min_num_patches: scale_max = scale else: scale_min = scale scale = scale_max - target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) - target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + target_height = get_scaled_image_size( + scale, image_height, patch_size, pixel_shuffle_scale + ) + target_width = get_scaled_image_size( + scale, image_width, patch_size, pixel_shuffle_scale + ) return target_height, target_width elif num_patches <= max_num_patches: return adjusted_height, adjusted_width @@ -628,16 +676,24 @@ def get_image_size_for_max_num_patches( scale_min, scale_max = eps / 10, 1.0 while (scale_max - scale_min) >= eps: scale = (scale_min + scale_max) / 2 - target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) - target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + target_height = get_scaled_image_size( + scale, image_height, patch_size, pixel_shuffle_scale + ) + target_width = get_scaled_image_size( + scale, image_width, patch_size, pixel_shuffle_scale + ) num_patches = (target_height / patch_size) * (target_width / patch_size) if num_patches <= max_num_patches: scale_min = scale else: 
scale_max = scale scale = scale_min - target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) - target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + target_height = get_scaled_image_size( + scale, image_height, patch_size, pixel_shuffle_scale + ) + target_width = get_scaled_image_size( + scale, image_width, patch_size, pixel_shuffle_scale + ) return target_height, target_width @@ -653,12 +709,13 @@ def prepare_image_tensor( Args: image (`torch.Tensor`): - Tensor with shape `(..., height, width, 3)` containing RGB values. The tensor is converted to floating - point if needed. + Tensor with shape `(..., height, width, 3)` containing RGB values. + The tensor is converted to floating point if needed. scale (`float`, *optional*, defaults to `VISION_SCALE`): Scalar multiplier applied before normalization. Returns: - `torch.Tensor`: Normalized tensor with the same shape as the input and dtype `torch.float32`. + `torch.Tensor`: Normalized tensor with the same shape as the input and + dtype `torch.float32`. """ if not torch.is_floating_point(image): image = image.float() @@ -683,17 +740,33 @@ def patchify_vision(image: torch.Tensor, patch_size: int) -> torch.Tensor: Returns: `torch.Tensor`: - Patch tensor where each position stores the flattened pixels belonging to that patch. + Patch tensor where each position stores the flattened pixels + belonging to that patch. Raises: ValueError: If `height` or `width` is not divisible by `patch_size`. """ num_images, height, width, channels = image.shape if height % patch_size or width % patch_size: - raise ValueError(f"Dimensions of images {image.shape} are not divisible by patch_size={patch_size}.") - patches = image.reshape(num_images, height // patch_size, patch_size, width // patch_size, patch_size, channels) + raise ValueError( + "Dimensions of images " + f"{image.shape} are not divisible by patch_size={patch_size}." 
+ ) + patches = image.reshape( + num_images, + height // patch_size, + patch_size, + width // patch_size, + patch_size, + channels, + ) patches = patches.permute(0, 1, 3, 2, 4, 5) - patches = patches.reshape(num_images, height // patch_size, width // patch_size, channels * patch_size * patch_size) + patches = patches.reshape( + num_images, + height // patch_size, + width // patch_size, + channels * patch_size * patch_size, + ) return patches @@ -708,21 +781,26 @@ def process_vision_for_patches( Args: images (`torch.Tensor`): - Either `(height, width, channels)` for a single image or `(num_images, height, width, channels)` for a - batch. Channels are expected to be RGB. + Either `(height, width, channels)` for a single image or + `(num_images, height, width, channels)` for a batch. Channels are + expected to be RGB. patch_size (`int`): Edge length of square patches; implictly controls resize grid granularity. max_num_patches (`int`): Maximum number of patches allowed after resizing. min_num_patches (`int`, *optional*): - Minimum number of patches. If provided, the routine upsamples images as needed to satisfy the lower bound. + Minimum number of patches. If provided, the routine upsamples images + as needed to satisfy the lower bound. pixel_shuffle_scale (`int`, *optional*, defaults to 1): - pixel shuffle scale factor; influences the target grid that the function produces. + Pixel shuffle scale factor; influences the target grid that the + function produces. Returns: - `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)` where `patches` has shape - `(num_images, target_h / patch_size, target_w / patch_size, channels * patch_size**2)` and `dims_virtual` - encodes effective `(images, height, width)` dimensions after optional pixel shuffling. 
+ `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)` + where `patches` has shape `(num_images, target_h / patch_size, target_w + / patch_size, channels * patch_size**2)` and `dims_virtual` encodes + effective `(images, height, width)` dimensions after optional pixel + shuffling. """ # Add batch dim if single image if images.dim() == 3: @@ -788,7 +866,7 @@ class IsaacConfig(Qwen3Config): **kwargs, ): super().__init__(**kwargs) - + # EventStreamProcessor parameters (for backward compatibility) self.video_patch_size = vision_patch_size self.vision_max_num_patches = vision_max_num_patches @@ -814,7 +892,6 @@ class IsaacImageProcessorKwargs(TypedDict, total=False): class IsaacImageProcessor: - patch_size = 16 max_num_patches = 6144 min_num_patches = 256 @@ -825,14 +902,18 @@ class IsaacImageProcessor: def __init__(self, kwargs): self.patch_size = kwargs.pop("patch_size", self.patch_size) - self.vision_max_num_patches = kwargs.pop("vision_max_num_patches", self.max_num_patches) - self.vision_min_num_patches = kwargs.pop("vision_min_num_patches", self.min_num_patches) + self.vision_max_num_patches = kwargs.pop( + "vision_max_num_patches", self.max_num_patches + ) + self.vision_min_num_patches = kwargs.pop( + "vision_min_num_patches", self.min_num_patches + ) self.pixel_shuffle_scale = kwargs.pop("pixel_shuffle_scale", 2) def preprocess( self, images: list[torch.Tensor], - return_tensors: Optional[Union[str, TensorType]], + return_tensors: str | TensorType | None, **kwargs: Unpack[IsaacImageProcessorKwargs], ) -> BatchFeature: """Isaac's resize → normalize → patchify → pack.""" @@ -840,9 +921,9 @@ class IsaacImageProcessor: all_pixel_values: list[torch.Tensor] = [] all_image_grids: list[torch.Tensor] = [] - for image in images: + for image in images: image_tensor = extract_image_pil(image) - + patches, dims_virtual = process_vision_for_patches( image_tensor, patch_size=self.patch_size, @@ -874,7 +955,10 @@ class IsaacImageProcessor: final_image_grids = 
torch.empty(0, 3) return BatchFeature( - data={"pixel_values": final_pixel_values, "image_grid_thw": final_image_grids}, + data={ + "pixel_values": final_pixel_values, + "image_grid_thw": final_image_grids, + }, tensor_type=return_tensors, ) @@ -899,7 +983,7 @@ class IsaacProcessor: image_result = self.image_processor.preprocess(images, **kwargs) result.update(image_result) return BatchFeature(result) - + def apply_chat_template( self, messages: list[dict[str, Any]], @@ -909,7 +993,7 @@ class IsaacProcessor: ) -> Any: # Convert mixed content messages to simple text format processed_messages = [] - + for message in messages: if "content" in message and isinstance(message["content"], list): # Handle mixed content (text + image) @@ -920,23 +1004,25 @@ class IsaacProcessor: elif content_item.get("type") == "image": # Replace image with vision token text_parts.append(self.image_token) - + processed_message = { "role": message.get("role", "user"), - "content": "".join(text_parts) + "content": "".join(text_parts), } processed_messages.append(processed_message) else: # Regular text message processed_messages.append(message) - + return self.tokenizer.apply_chat_template( - processed_messages, tokenize=tokenize, add_generation_prompt=add_generation_prompt, **kwargs + processed_messages, + tokenize=tokenize, + add_generation_prompt=add_generation_prompt, + **kwargs, ) class IsaacProcessingInfo(BaseProcessingInfo): - def get_hf_config(self) -> IsaacConfig: if hasattr(self.ctx, "get_hf_config"): original_config = self.ctx.get_hf_config() @@ -945,10 +1031,16 @@ class IsaacProcessingInfo(BaseProcessingInfo): # Vision parameters - map from HF names vision_config=getattr(original_config, "vision_config", None), vision_patch_size=getattr(original_config, "video_patch_size", 16), - vision_max_num_patches=getattr(original_config, "vision_max_num_patches", 256), - vision_min_num_patches=getattr(original_config, "vision_min_num_patches", None), + vision_max_num_patches=getattr( + 
original_config, "vision_max_num_patches", 256 + ), + vision_min_num_patches=getattr( + original_config, "vision_min_num_patches", None + ), pixel_shuffle_scale=getattr(original_config, "pixel_shuffle_scale", 1), - max_sequence_length=getattr(original_config, "max_sequence_length", 16384), + max_sequence_length=getattr( + original_config, "max_sequence_length", 16384 + ), vision_token="<|image_pad|>", ) return IsaacConfig() @@ -975,18 +1067,22 @@ class IsaacProcessingInfo(BaseProcessingInfo): def get_image_processor(self, **kwargs) -> IsaacImageProcessor: return self.get_hf_processor(**kwargs).image_processor - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} def get_mm_max_tokens_per_item( - self, seq_len: int, mm_counts: Mapping[str, int], + self, + seq_len: int, + mm_counts: Mapping[str, int], ) -> Mapping[str, int]: hf_config = self.get_hf_config() - num_vision_tokens = hf_config.vision_max_num_patches // (hf_config.pixel_shuffle_scale**2) + num_vision_tokens = hf_config.vision_max_num_patches // ( + hf_config.pixel_shuffle_scale**2 + ) return {"image": num_vision_tokens} -class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): +class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -1017,19 +1113,19 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): class IsaacMultiModalProcessor(BaseMultiModalProcessor): - def _get_mm_fields_config( self, hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - # Configure multimodal fields for Isaac model image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) image_grid_sizes = image_grid_thw.prod(-1) return { - "pixel_values": MultiModalFieldConfig.flat_from_sizes("image", 
image_grid_sizes), + "pixel_values": MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes + ), "image_grid_thw": MultiModalFieldConfig.batched("image"), } @@ -1039,24 +1135,23 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: - - #hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + # hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() vocab = tokenizer.get_vocab() placeholder_id = vocab.get("<|image_pad|>", 151655) - - pixel_shuffle_scale = getattr(image_processor, 'pixel_shuffle_scale', 2) - merge_length = pixel_shuffle_scale ** 2 - + + pixel_shuffle_scale = getattr(image_processor, "pixel_shuffle_scale", 2) + merge_length = pixel_shuffle_scale**2 + def get_replacement_isaac(item_idx: int): out_item = out_mm_kwargs["image"][item_idx] grid_thw = out_item["image_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) num_tokens = int(grid_thw.prod()) // merge_length - return [placeholder_id] * num_tokens + return [placeholder_id] * num_tokens return [ PromptReplacement( @@ -1066,6 +1161,7 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): ) ] + class Siglip2VisionTransformer(nn.Module): def __init__( self, @@ -1107,7 +1203,9 @@ class Siglip2VisionTransformer(nn.Module): # Get embeddings from packed sequence hidden_states = self.embeddings((seq_patches, seq_sizes, token_grids)) - grid_thws = torch.tensor([[1, token_grids[0][0].item(), token_grids[0][1].item()]]) + grid_thws = torch.tensor( + [[1, token_grids[0][0].item(), token_grids[0][1].item()]] + ) last_hidden_state = self.encoder(hidden_states, grid_thws) hidden_states = self.post_layernorm(last_hidden_state) @@ -1123,7 +1221,7 @@ class Siglip2VisionTransformer(nn.Module): # Remove the pseudo batch dimension we added earlier 
hidden_states = hidden_states.squeeze(0) - #return last_hidden_state + # return last_hidden_state return hidden_states def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -1160,9 +1258,8 @@ class Siglip2VisionTransformer(nn.Module): dummy_inputs=IsaacDummyInputsBuilder, ) class IsaacForConditionalGeneration( - Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE + Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): - packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -1176,11 +1273,11 @@ class IsaacForConditionalGeneration( } supports_encoder_tp_data = True - + # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ - "model.vision_embedding.": "vision_embedding.", + "model.vision_embedding.": "vision_embedding.", } ) @@ -1188,11 +1285,10 @@ class IsaacForConditionalGeneration( def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): return "<|image_pad|>" - + raise ValueError("Only image modality is supported") def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): - config: IsaacConfig = vllm_config.model_config.hf_config head_dim = config.head_dim @@ -1207,18 +1303,22 @@ class IsaacForConditionalGeneration( # Initialize the parent class with updated config super().__init__(vllm_config=vllm_config, prefix=prefix) - + # Create the language model module to match checkpoint structure - self.language_model = nn.ModuleDict({ - "embed_tokens": self.model.embed_tokens, - "layers": self.model.layers, - "norm": self.model.norm - }) - + self.language_model = nn.ModuleDict( + { + "embed_tokens": self.model.embed_tokens, + "layers": self.model.layers, + "norm": self.model.norm, + } + ) + config.vision_config.preserve_original_pe = True config.vision_config.use_rope = False - config.vision_config.hidden_stride = config.vision_config.pixel_shuffle_scale_factor - 
config.vision_config.window_size = 32*2 + config.vision_config.hidden_stride = ( + config.vision_config.pixel_shuffle_scale_factor + ) + config.vision_config.window_size = 32 * 2 config.vision_config.fullatt_block_indexes = None vision_cfg = config.vision_config if vision_cfg is None: @@ -1226,7 +1326,9 @@ class IsaacForConditionalGeneration( hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) self.vision_embedding = nn.Sequential( - Siglip2VisionTransformer(vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding")), + Siglip2VisionTransformer( + vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding") + ), nn.Linear( hidden_dim, 4 * hidden_dim, @@ -1250,26 +1352,32 @@ class IsaacForConditionalGeneration( ) -> tuple[torch.Tensor, int]: """Get mrope input positions and delta value.""" - vision_token_id = getattr(self.config, 'image_token_id', 151655) + vision_token_id = getattr(self.config, "image_token_id", 151655) spatial_merge_size = hf_config.vision_config.pixel_shuffle_scale_factor input_tokens_tensor = torch.tensor(input_tokens) - + # Find image token positions - image_positions = torch.where(input_tokens_tensor == vision_token_id)[0].tolist() - - # For text-only inputs, use Isaac's original logic from compute_position_ids_input_ids() + image_positions = torch.where(input_tokens_tensor == vision_token_id)[ + 0 + ].tolist() + + # For text-only inputs, use Isaac's original logic from + # compute_position_ids_input_ids() if len(image_positions) == 0: seq_len = len(input_tokens) - # Create 3D positions where all dimensions get the same 1D temporal progression + # Create 3D positions where all dimensions get the same 1D temporal + # progression position_ids = torch.arange(seq_len, dtype=torch.long) position_ids = position_ids.view(1, -1).expand(1, -1) # [1, seq_len] - position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3) # [1, seq_len, 3] + position_ids = position_ids.unsqueeze(2).expand( + -1, -1, 3 + ) # [1, seq_len, 3] # 
vLLM expects shape [3, seq_len], so transpose position_ids = position_ids.squeeze(0).transpose(0, 1) # [3, seq_len] - + return position_ids, 0 - + events = [] image_idx = 0 current_pos = 0 @@ -1278,7 +1386,7 @@ class IsaacForConditionalGeneration( for image_pos in image_positions: if image_pos <= last_processed_pos: continue # Skip already processed positions - + # Add any text before this image if image_pos > current_pos: text_tokens = image_pos - current_pos @@ -1288,21 +1396,23 @@ class IsaacForConditionalGeneration( idx_range=(0, text_tokens), ) events.append(text_event) - + # Add image t, h, w = image_grid_thw[image_idx] llm_grid_h, llm_grid_w = h // spatial_merge_size, w // spatial_merge_size image_tokens = t * llm_grid_h * llm_grid_w - + image_event = Event( modality_type=VisionType.image, dims_virtual=[t, llm_grid_h, llm_grid_w], idx_range=(0, image_tokens), ) events.append(image_event) - + current_pos = image_pos + image_tokens - last_processed_pos = current_pos - 1 # Mark up to this position as processed + last_processed_pos = ( + current_pos - 1 + ) # Mark up to this position as processed image_idx += 1 # Add final text segment if any @@ -1314,7 +1424,7 @@ class IsaacForConditionalGeneration( idx_range=(0, text_tokens), ) events.append(text_event) - + stream = Stream(events) tensor_stream = TensorStream([stream]) @@ -1334,8 +1444,7 @@ class IsaacForConditionalGeneration( def get_multimodal_embeddings( self, **kwargs: object - ) -> MultiModalEmbeddings | None: - + ) -> MultiModalEmbeddings | None: pixel_values = kwargs.get("pixel_values") image_grid_thw = kwargs.get("image_grid_thw") @@ -1343,15 +1452,21 @@ class IsaacForConditionalGeneration( return [] # Convert image_grid_thw from [batch, 1, [T, H, W]] to [batch, [H, W]] - spatial_grids = image_grid_thw[:, 0, 1:3] # Extract H, W from [T, H, W] for each image - + spatial_grids = image_grid_thw[ + :, 0, 1:3 + ] # Extract H, W from [T, H, W] for each image + # Process packed sequence patches through 
vision_embedding module vision_embeddings = self.vision_embedding((pixel_values, spatial_grids)) # Split concatenated embeddings for each image item (following Qwen2-VL pattern) - merge_size = self.config.vision_config.pixel_shuffle_scale_factor # Isaac uses pixel shuffle - sizes = spatial_grids.prod(-1) // (merge_size * merge_size) # H * W / (merge_size^2) - + merge_size = ( + self.config.vision_config.pixel_shuffle_scale_factor + ) # Isaac uses pixel shuffle + sizes = spatial_grids.prod(-1) // ( + merge_size * merge_size + ) # H * W / (merge_size^2) + return vision_embeddings.split(sizes.tolist()) def get_input_embeddings( @@ -1362,13 +1477,11 @@ class IsaacForConditionalGeneration( is_multimodal: torch.Tensor | None = None, handle_oov_mm_token: bool = False, ) -> torch.Tensor: - # Get text embeddings from the base language model inputs_embeds = super().get_input_embeddings(input_ids) - + # If we have multimodal embeddings, merge them with text embeddings if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: - inputs_embeds = _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, @@ -1379,7 +1492,7 @@ class IsaacForConditionalGeneration( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: skip_prefixes = [] - + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From ac8a0b936aa25bf5f866dbe843950f8c30d45654 Mon Sep 17 00:00:00 2001 From: Yang Date: Fri, 21 Nov 2025 19:52:24 -0800 Subject: [PATCH 06/18] [Feature] Enhance Isaac model with vision embedding and attention mechanisms Signed-off-by: Yang --- vllm/model_executor/models/isaac.py | 452 +++++++++++++++++++++++++--- 1 file changed, 417 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 5c61e5bf48a70..d2d980a9aadf4 100644 --- a/vllm/model_executor/models/isaac.py 
+++ b/vllm/model_executor/models/isaac.py @@ -14,15 +14,30 @@ import PIL.Image import torch import torch.nn as nn import torch.nn.functional as F +from einops import rearrange from transformers import PretrainedConfig, Qwen3Config from transformers.image_processing_utils import BatchFeature from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig from transformers.tokenization_utils import TensorType from typing_extensions import TypedDict, Unpack -from vllm.attention.backends.registry import _Backend +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.layer import ( + check_upstream_fa_availability, + maybe_get_vit_flash_attn_backend, +) +from vllm.attention.ops.vit_attn_wrappers import ( + vit_xformers_attn_wrapper, +) from vllm.config import VllmConfig -from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, @@ -36,13 +51,14 @@ from vllm.model_executor.models.interfaces import ( ) from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM -from vllm.model_executor.models.siglip2navit import Siglip2Encoder +from vllm.model_executor.models.siglip import SiglipMLP from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, _merge_multimodal_embeddings, maybe_prefix, ) +from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, @@ -332,6 +348,16 @@ class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): 
self.num_patches = num_patches +def create_cumulative_seq_lengths( + seq_sizes: torch.Tensor, device: torch.device +) -> tuple[torch.Tensor, int]: + """Create cumulative sequence lengths for variable-length attention.""" + cu_seqlens = torch.zeros(len(seq_sizes) + 1, dtype=torch.int32, device=device) + cu_seqlens[1:] = seq_sizes.cumsum(0) + max_seqlen = int(seq_sizes.max().item()) if len(seq_sizes) > 0 else 0 + return cu_seqlens, max_seqlen + + class Siglip2VariableSequenceEmbeddings(nn.Module): def __init__(self, config: PixelShuffleSiglip2VisionConfig): super().__init__() @@ -367,7 +393,7 @@ class Siglip2VariableSequenceEmbeddings(nn.Module): align_corners = False antialias = True for spatial_shape in spatial_shapes: - height, width = spatial_shape + height, width = int(spatial_shape[0]), int(spatial_shape[1]) # Guard to ensure height and width are positive for torch.compile if height > 0 and width > 0: resized_pos_embed = F.interpolate( @@ -399,21 +425,16 @@ class Siglip2VariableSequenceEmbeddings(nn.Module): ): seq_patches, _seq_sizes, _spatial_shapes = packed_seq_patches - # Apply patch embeddings - target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding(seq_patches.to(dtype=target_dtype)) + target_weight = self.patch_embedding.weight + seq_patches = seq_patches.to( + device=target_weight.device, dtype=target_weight.dtype + ) + patch_embeds = self.patch_embedding(seq_patches) pos_embeds = self.positional_embeddings(packed_seq_patches) # Flatten patch embeddings to match positional embeddings format - batch_size, patches_per_image, embed_dim = patch_embeds.shape - - # For variable-length attention, we need to reshape to (total_tokens, embed_dim) - if batch_size != 1: - raise ValueError( - "Variable-length attention expects batch_size=1 for packed sequences" - ) - - patch_embeds = patch_embeds.view(batch_size * patches_per_image, embed_dim) + if patch_embeds.dim() == 3: + patch_embeds = patch_embeds.view(-1, patch_embeds.size(-1)) 
# Add positional embeddings to patch embeddings embeddings = patch_embeds + pos_embeds @@ -1162,6 +1183,313 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): ] +def all_gather_interleave(local_tensor: torch.Tensor, hidden_size: int, tp_size: int): + """All-gather the input tensor interleavely across model parallel group.""" + import torch.distributed as dist + + gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)] + dist.all_gather( + gathered_tensors, local_tensor, group=parallel_state.get_tp_group().device_group + ) + + gathered_tensors_split = [ + torch.split(tensor, hidden_size // tp_size, -1) for tensor in gathered_tensors + ] + ordered_tensors = [ + tensor for pair in zip(*gathered_tensors_split) for tensor in pair + ] + return torch.cat(ordered_tensors, dim=-1) + + +class Siglip2VisionAttention(nn.Module): + def __init__( + self, + config: PixelShuffleSiglip2VisionConfig, + quant_config: QuantizationConfig | None = None, + *, + prefix: str = "", + use_data_parallel: bool = False, + use_upstream_fa: bool = False, + attn_backend: AttentionBackendEnum | None = None, + attn_backend_override: AttentionBackendEnum | None = None, + ) -> None: + super().__init__() + + self.tp_size = ( + 1 + if use_data_parallel + else parallel_state.get_tensor_model_parallel_world_size() + ) + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.hidden_size_per_attention_head = dist_utils.divide( + config.hidden_size, config.num_attention_heads + ) + self.num_attention_heads_per_partition = dist_utils.divide( + config.num_attention_heads, self.tp_size + ) + + self.qkv_proj = QKVParallelLinear( + hidden_size=config.hidden_size, + head_size=self.hidden_size_per_attention_head, + total_num_heads=config.num_attention_heads, + total_num_kv_heads=config.num_attention_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + disable_tp=use_data_parallel, + ) + self.out_proj = RowParallelLinear( + 
input_size=config.hidden_size, + output_size=config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + disable_tp=use_data_parallel, + ) + + self.use_upstream_fa = use_upstream_fa + self.attn_backend = attn_backend + + if self.attn_backend not in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.ROCM_AITER_FA, + } and check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = AttentionBackendEnum.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.TORCH_SDPA, + AttentionBackendEnum.XFORMERS, + AttentionBackendEnum.ROCM_AITER_FA, + }: + raise RuntimeError( + f"Isaac vision embedding does not support {self.attn_backend} backend." + ) + self.attn_backend, self.flash_attn_varlen_func = ( + maybe_get_vit_flash_attn_backend( + self.attn_backend, + self.use_upstream_fa, + attn_backend_override=attn_backend_override, + ) + ) + self.is_flash_attn_backend = self.attn_backend in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.ROCM_AITER_FA, + } + + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = all_gather_interleave(qkv, self.qkv_proj.hidden_size, self.tp_size) + + q, k, v = qkv.chunk(3, dim=2) + + if self.tp_size > 1: + q = dist_utils.split_tensor_along_last_dim(q, self.tp_size)[self.tp_rank] + k = dist_utils.split_tensor_along_last_dim(k, self.tp_size)[self.tp_rank] + v = dist_utils.split_tensor_along_last_dim(v, self.tp_size)[self.tp_rank] + + new_shape = ( + seq_len, + bs, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + + def forward( + self, + hidden_states: torch.Tensor, + *, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor | None, + seqlens: torch.Tensor | None, + ) -> torch.Tensor: + batch_size, _, _ = hidden_states.shape + if 
batch_size != 1: + raise ValueError("packed variable-length attention expects batch_size=1") + x = rearrange(hidden_states, "b s d -> s b d") + x, _ = self.qkv_proj(x) + q, k, v = self.split_qkv(x) + q, k, v = (rearrange(t, "s b h d -> b s h d") for t in (q, k, v)) + + if self.is_flash_attn_backend: + q, k, v = (rearrange(t, "b s ... -> (b s) ...") for t in (q, k, v)) + output = self.flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0.0, + causal=False, + ) + context_layer = rearrange( + output, "(b s) h d -> s b (h d)", b=batch_size + ).contiguous() + elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: + outputs = [] + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = ( + rearrange(tensor, "b s h d -> b h s d") + for tensor in (q_i, k_i, v_i) + ) + output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) + output_i = rearrange(output_i, "b h s d -> b s h d") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange( + context_layer, "b s h d -> s b (h d)" + ).contiguous() + elif self.attn_backend == AttentionBackendEnum.XFORMERS: + if seqlens is None: + raise ValueError("xFormers attention backend requires seqlens tensor.") + context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens) + else: + raise RuntimeError( + f"Isaac vision embedding does not support {self.attn_backend} backend." 
+ ) + + output, _ = self.out_proj(context_layer) + output = rearrange(output, "s b d -> b s d") + return output + + +class Siglip2EncoderLayer(nn.Module): + def __init__( + self, + config: PixelShuffleSiglip2VisionConfig, + quant_config: QuantizationConfig | None = None, + *, + prefix: str = "", + attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, + attn_backend_override: AttentionBackendEnum | None = None, + use_upstream_fa: bool = False, + use_data_parallel: bool = False, + ) -> None: + super().__init__() + self.embed_dim = config.hidden_size + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.self_attn = Siglip2VisionAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + use_data_parallel=use_data_parallel, + use_upstream_fa=use_upstream_fa, + attn_backend=attn_backend, + attn_backend_override=attn_backend_override, + ) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP( + config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + + def forward( + self, + hidden_states: torch.Tensor, + *, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor | None, + seqlens: torch.Tensor | None, + ) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Siglip2Encoder(nn.Module): + def __init__( + self, + config: PixelShuffleSiglip2VisionConfig, + quant_config: QuantizationConfig | None = None, + *, + prefix: str = "", + use_data_parallel: bool = False, + attn_backend_override: AttentionBackendEnum | None = None, + ) -> None: 
+ super().__init__() + self.config = config + embed_dim = config.hidden_size + num_heads = config.num_attention_heads + head_dim = embed_dim // num_heads + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, + dtype=torch.get_default_dtype(), + attn_backend_override=attn_backend_override, + ) + self.use_upstream_fa = False + if self.attn_backend not in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.ROCM_AITER_FA, + } and check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = AttentionBackendEnum.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.TORCH_SDPA, + AttentionBackendEnum.XFORMERS, + AttentionBackendEnum.ROCM_AITER_FA, + }: + raise RuntimeError( + f"Isaac vision embedding does not support {self.attn_backend} backend." + ) + self.layers = nn.ModuleList( + [ + Siglip2EncoderLayer( + config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + attn_backend=self.attn_backend, + attn_backend_override=attn_backend_override, + use_upstream_fa=self.use_upstream_fa, + use_data_parallel=use_data_parallel, + ) + for layer_idx in range(config.num_hidden_layers) + ] + ) + + def forward( + self, + inputs_embeds: torch.Tensor, + *, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, + seqlens: torch.Tensor | None = None, + ) -> torch.Tensor: + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + return hidden_states + + class Siglip2VisionTransformer(nn.Module): def __init__( self, @@ -1169,7 +1497,7 @@ class Siglip2VisionTransformer(nn.Module): quant_config: QuantizationConfig | None = None, prefix: str = "", use_data_parallel: bool = False, - attn_backend_override: _Backend | None = None, + attn_backend_override: AttentionBackendEnum | None = None, ): 
super().__init__() self.config = config @@ -1187,6 +1515,19 @@ class Siglip2VisionTransformer(nn.Module): ) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + def compute_attn_mask_seqlen( + self, cu_seqlens: torch.Tensor + ) -> tuple[torch.Tensor | None, torch.Tensor | None]: + max_seqlen, seqlens = None, None + if self.encoder.attn_backend in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.ROCM_AITER_FA, + }: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + elif self.encoder.attn_backend == AttentionBackendEnum.XFORMERS: + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] + return max_seqlen, seqlens + def forward( self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor], @@ -1203,15 +1544,20 @@ class Siglip2VisionTransformer(nn.Module): # Get embeddings from packed sequence hidden_states = self.embeddings((seq_patches, seq_sizes, token_grids)) - grid_thws = torch.tensor( - [[1, token_grids[0][0].item(), token_grids[0][1].item()]] - ) - last_hidden_state = self.encoder(hidden_states, grid_thws) - hidden_states = self.post_layernorm(last_hidden_state) - # Add a pseudo batch dimension for the encoder hidden_states = hidden_states.unsqueeze(0) + cu_seqlens, _ = create_cumulative_seq_lengths(seq_sizes, hidden_states.device) + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + + hidden_states = self.encoder( + inputs_embeds=hidden_states, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + hidden_states = self.post_layernorm(hidden_states) + if self.pixel_shuffle_scale_factor > 1: hidden_states = pixel_shuffle_varlen( x=hidden_states, @@ -1252,6 +1598,44 @@ class Siglip2VisionTransformer(nn.Module): return loaded_params +class IsaacVisionEmbedding(nn.Module): + def __init__( + self, + vision_cfg: PixelShuffleSiglip2VisionConfig, + hidden_dim: int, + output_dim: int, + prefix: str, + ): + super().__init__() + self.transformer = Siglip2VisionTransformer( + vision_cfg, 
prefix=maybe_prefix(prefix, "vision_embedding") + ) + self.linear_fc1 = ColumnParallelLinear( + hidden_dim, + 4 * hidden_dim, + bias=False, + prefix=maybe_prefix(prefix, "vision_embedding.1"), + return_bias=False, + ) + self.act = nn.SiLU() + self.linear_fc2 = RowParallelLinear( + 4 * hidden_dim, + output_dim, + bias=False, + prefix=maybe_prefix(prefix, "vision_embedding.3"), + return_bias=False, + ) + + def forward( + self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor] + ) -> torch.Tensor: + hidden_states = self.transformer(packed_seq_patches) + hidden_states = self.linear_fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_fc2(hidden_states) + return hidden_states + + @MULTIMODAL_REGISTRY.register_processor( IsaacMultiModalProcessor, info=IsaacProcessingInfo, @@ -1278,6 +1662,10 @@ class IsaacForConditionalGeneration( hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.vision_embedding.": "vision_embedding.", + "vision_embedding.0": "vision_embedding.transformer", + "vision_embedding.1": "vision_embedding.linear_fc1", + "vision_embedding.2": "vision_embedding.act", + "vision_embedding.3": "vision_embedding.linear_fc2", } ) @@ -1325,17 +1713,11 @@ class IsaacForConditionalGeneration( raise ValueError("IsaacConfig should always have vision_config") hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) - self.vision_embedding = nn.Sequential( - Siglip2VisionTransformer( - vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding") - ), - nn.Linear( - hidden_dim, - 4 * hidden_dim, - bias=False, - ), - nn.SiLU(), - nn.Linear(4 * hidden_dim, config.hidden_size, bias=False), + self.vision_embedding = IsaacVisionEmbedding( + vision_cfg=vision_cfg, + hidden_dim=hidden_dim, + output_dim=config.hidden_size, + prefix=prefix, ) def get_mrope_input_positions( @@ -1502,6 +1884,6 @@ class IsaacForConditionalGeneration( """ return MultiModelKeys.from_string_field( language_model="language_model", 
- connector="vision_embedding.3", # The final linear layer + connector="vision_embedding.linear_fc2", # The final linear layer tower_model="vision_embedding", ) From c10f5653baeb6d1e1d684161c37bd59ce02edcf7 Mon Sep 17 00:00:00 2001 From: Yang Date: Wed, 26 Nov 2025 15:09:47 -0800 Subject: [PATCH 07/18] 1. Add support for Isaac model in the registry and documentation 2. optimize Isaac model implementation. Signed-off-by: Yang --- docs/models/supported_models.md | 1 + tests/models/registry.py | 4 + vllm/model_executor/models/isaac.py | 394 +++++++++++++++------------- 3 files changed, 212 insertions(+), 187 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 9ba0f4ca9096e..470807ff8da91 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -679,6 +679,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | | `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + IE+ | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | +| `IsaacForConditionalGeneration` | Isaac | T + I+ | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ | | `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + IE+ + VE+ | `OpenGVLab/InternVL3-1B-hf`, etc. 
| ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index c5d72b5d581b9..7ce22f4238167 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -646,6 +646,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { "HuggingFaceM4/Idefics3-8B-Llama3", extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, ), + "IsaacForConditionalGeneration": _HfExamplesInfo( + "PerceptronAI/Isaac-0.1", + trust_remote_code=True, + ), "InternS1ForConditionalGeneration": _HfExamplesInfo( "internlm/Intern-S1", trust_remote_code=True ), diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index d2d980a9aadf4..82dae62cb56e4 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -4,7 +4,7 @@ from __future__ import annotations import itertools import math -from collections.abc import Iterable, Mapping, Sequence +from collections.abc import Iterable, Iterator, Mapping, Sequence from dataclasses import dataclass from enum import Enum from typing import Any @@ -15,7 +15,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from transformers import PretrainedConfig, Qwen3Config +from transformers import Qwen3Config from transformers.image_processing_utils import BatchFeature from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig from transformers.tokenization_utils import TensorType @@ -30,8 +30,10 @@ from vllm.attention.ops.vit_attn_wrappers import ( vit_xformers_attn_wrapper, ) from vllm.config import VllmConfig +from vllm.config.model import ModelConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils +from vllm.logger import init_logger from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -50,18 +52,18 @@ from vllm.model_executor.models.interfaces import ( SupportsPP, ) from vllm.model_executor.models.module_mapping import MultiModelKeys -from 
vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.models.siglip import SiglipMLP from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, - _merge_multimodal_embeddings, + init_vllm_registered_model, maybe_prefix, ) from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, + MultiModalFeatureSpec, MultiModalFieldConfig, MultiModalKwargs, ) @@ -73,6 +75,13 @@ from vllm.multimodal.processing import ( PromptUpdate, ) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.tokenizer import ( + get_cached_tokenizer, + get_tokenizer, +) + +logger = init_logger(__name__) # ===== TensorStream Compatibility Layer for Isaac MRoPE ===== # Minimal implementation of TensorStream classes needed for Isaac's 3D positional @@ -286,12 +295,14 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten dims = (event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or [])) # Create ranges for each dimension (similar to old _finalize implementation) - first_dim = range(cumulative_offset, cumulative_offset + dims[0]) + first_dim = list(range(cumulative_offset, cumulative_offset + dims[0])) cumulative_offset += dims[0] # advance time for the next event - other_dims = [range(d) for d in dims[1:]] - # Use itertools.product to create all coordinate combinations - full_coords = list(itertools.product(first_dim, *other_dims)) + if event.modality_type != VisionType.image: + full_coords = [(t, t, t) for t in first_dim] + else: + other_dims = [range(d) for d in dims[1:]] + full_coords = list(itertools.product(first_dim, *other_dims)) # Slice if the event is partial s, e = event.idx_range @@ -307,6 +318,19 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten ) +def 
_resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int: + tokenizer_name = model_config.tokenizer or model_config.model + tokenizer = get_cached_tokenizer( + get_tokenizer( + tokenizer_name, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.tokenizer_revision or model_config.revision, + ) + ) + return tokenizer.encode(vision_token, add_special_tokens=False)[0] + + def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor: """Create boolean mask for specific modality type in the tensor stream.""" B, T = ts.shape @@ -883,7 +907,8 @@ class IsaacConfig(Qwen3Config): vision_min_num_patches: int | None = None, pixel_shuffle_scale: int = 1, max_sequence_length: int = 16384, - vision_token: str = "<|image_pad|>", + vision_token: str = "", + vision_attn_implementation: str | None = None, **kwargs, ): super().__init__(**kwargs) @@ -899,10 +924,25 @@ class IsaacConfig(Qwen3Config): self.vision_token = vision_token # Handle vision config - PixelShuffleSiglip2VisionConfig instance - self.vision_config = PixelShuffleSiglip2VisionConfig( - pixel_shuffle_scale_factor=pixel_shuffle_scale, - num_patches=vision_max_num_patches, + if isinstance(vision_config, dict): + self.vision_config = PixelShuffleSiglip2VisionConfig(**vision_config) + elif vision_config is None: + self.vision_config = PixelShuffleSiglip2VisionConfig() + else: + self.vision_config = vision_config + + # Ensure compatibility with pretrained checkpoints + self.vision_config.pixel_shuffle_scale_factor = getattr( + self.vision_config, + "pixel_shuffle_scale_factor", + pixel_shuffle_scale, ) + self.vision_config.num_patches = getattr( + self.vision_config, + "num_patches", + vision_max_num_patches, + ) + self.vision_attn_implementation = vision_attn_implementation class IsaacImageProcessorKwargs(TypedDict, total=False): @@ -991,9 +1031,9 @@ class IsaacProcessor: tokenizer_class = ("Qwen2Tokenizer", 
"Qwen2TokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): + self.image_token = kwargs.pop("image_token", "") self.image_processor = image_processor or IsaacImageProcessor(kwargs) self.tokenizer = tokenizer - self.image_token = "<|image_pad|>" def __call__(self, text=None, images=None, **kwargs) -> BatchFeature: result = {} @@ -1062,12 +1102,20 @@ class IsaacProcessingInfo(BaseProcessingInfo): max_sequence_length=getattr( original_config, "max_sequence_length", 16384 ), - vision_token="<|image_pad|>", + vision_token=getattr(original_config, "vision_token", ""), + vision_attn_implementation=getattr( + original_config, "vision_attn_implementation", None + ), ) return IsaacConfig() def get_hf_processor(self, **kwargs) -> IsaacProcessor: - return self.ctx.get_hf_processor(IsaacProcessor, **kwargs) + hf_config = self.get_hf_config() + processor_kwargs = { + "image_token": hf_config.vision_token, + } + processor_kwargs.update(kwargs) + return self.ctx.get_hf_processor(IsaacProcessor, **processor_kwargs) def get_tokenizer(self): return self.ctx.tokenizer @@ -1157,11 +1205,13 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: # hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + hf_config = self.info.get_hf_config() image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() - - vocab = tokenizer.get_vocab() - placeholder_id = vocab.get("<|image_pad|>", 151655) + placeholder_id = tokenizer.encode( + hf_config.vision_token, + add_special_tokens=False, + ) pixel_shuffle_scale = getattr(image_processor, "pixel_shuffle_scale", 2) merge_length = pixel_shuffle_scale**2 @@ -1172,12 +1222,12 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): assert isinstance(grid_thw, torch.Tensor) num_tokens = int(grid_thw.prod()) // merge_length - return [placeholder_id] * num_tokens + return placeholder_id * 
num_tokens return [ PromptReplacement( modality="image", - target=[placeholder_id], + target=placeholder_id, replacement=get_replacement_isaac, ) ] @@ -1278,16 +1328,7 @@ class Siglip2VisionAttention(nn.Module): def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: seq_len, bs, _ = qkv.shape - if self.tp_size > 1: - qkv = all_gather_interleave(qkv, self.qkv_proj.hidden_size, self.tp_size) - q, k, v = qkv.chunk(3, dim=2) - - if self.tp_size > 1: - q = dist_utils.split_tensor_along_last_dim(q, self.tp_size)[self.tp_rank] - k = dist_utils.split_tensor_along_last_dim(k, self.tp_size)[self.tp_rank] - v = dist_utils.split_tensor_along_last_dim(v, self.tp_size)[self.tp_rank] - new_shape = ( seq_len, bs, @@ -1604,7 +1645,8 @@ class IsaacVisionEmbedding(nn.Module): vision_cfg: PixelShuffleSiglip2VisionConfig, hidden_dim: int, output_dim: int, - prefix: str, + quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() self.transformer = Siglip2VisionTransformer( @@ -1614,6 +1656,7 @@ class IsaacVisionEmbedding(nn.Module): hidden_dim, 4 * hidden_dim, bias=False, + quant_config=quant_config, prefix=maybe_prefix(prefix, "vision_embedding.1"), return_bias=False, ) @@ -1622,6 +1665,7 @@ class IsaacVisionEmbedding(nn.Module): 4 * hidden_dim, output_dim, bias=False, + quant_config=quant_config, prefix=maybe_prefix(prefix, "vision_embedding.3"), return_bias=False, ) @@ -1642,8 +1686,9 @@ class IsaacVisionEmbedding(nn.Module): dummy_inputs=IsaacDummyInputsBuilder, ) class IsaacForConditionalGeneration( - Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE + nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): + merge_by_field_config = True packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -1661,221 +1706,196 @@ class IsaacForConditionalGeneration( # To ensure correct weight loading and mapping. 
hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.vision_embedding.0": "vision_embedding.transformer", + "model.vision_embedding.1": "vision_embedding.linear_fc1", + "model.vision_embedding.2": "vision_embedding.act", + "model.vision_embedding.3": "vision_embedding.linear_fc2", "model.vision_embedding.": "vision_embedding.", - "vision_embedding.0": "vision_embedding.transformer", - "vision_embedding.1": "vision_embedding.linear_fc1", - "vision_embedding.2": "vision_embedding.act", - "vision_embedding.3": "vision_embedding.linear_fc2", + "model.": "language_model.model.", } ) @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): - return "<|image_pad|>" + return "" raise ValueError("Only image modality is supported") def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): + super().__init__() config: IsaacConfig = vllm_config.model_config.hf_config - head_dim = config.head_dim + quant_config = vllm_config.quant_config + self.config = config + self.multimodal_config = vllm_config.model_config.multimodal_config + head_dim = config.head_dim calculated_mrope_section = [ head_dim // 4, # 2x more for temporal dim head_dim // 8, head_dim // 8, ] + self.vision_token_id = _resolve_vision_token_id( + vllm_config.model_config, config.vision_token + ) + config.image_token_id = self.vision_token_id + + logger.info("vllm config: %s", repr(vllm_config)) config.rope_scaling["mrope_section"] = calculated_mrope_section - self.config = config - - # Initialize the parent class with updated config - super().__init__(vllm_config=vllm_config, prefix=prefix) - - # Create the language model module to match checkpoint structure - self.language_model = nn.ModuleDict( - { - "embed_tokens": self.model.embed_tokens, - "layers": self.model.layers, - "norm": self.model.norm, - } + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + 
architectures=["Qwen3ForCausalLM"], + prefix=maybe_prefix(prefix, "language_model"), + ) + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors ) - config.vision_config.preserve_original_pe = True - config.vision_config.use_rope = False - config.vision_config.hidden_stride = ( - config.vision_config.pixel_shuffle_scale_factor - ) - config.vision_config.window_size = 32 * 2 - config.vision_config.fullatt_block_indexes = None vision_cfg = config.vision_config if vision_cfg is None: raise ValueError("IsaacConfig should always have vision_config") + vision_cfg.preserve_original_pe = True + vision_cfg.use_rope = False + vision_cfg.hidden_stride = vision_cfg.pixel_shuffle_scale_factor + vision_cfg.window_size = 32 * 2 + vision_cfg.fullatt_block_indexes = None + attn_impl = ( + config.vision_attn_implementation + if config.vision_attn_implementation is not None + else getattr(config, "_attn_implementation", None) + ) + if attn_impl is not None: + vision_cfg._attn_implementation = attn_impl hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) self.vision_embedding = IsaacVisionEmbedding( vision_cfg=vision_cfg, hidden_dim=hidden_dim, output_dim=config.hidden_size, - prefix=prefix, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "vision_embedding"), ) + def iter_mm_grid_hw( + self, input_tokens: list[int], mm_features: list[MultiModalFeatureSpec] + ) -> Iterator[tuple[int, int, int]]: + spatial_merge_size = self.config.vision_config.pixel_shuffle_scale_factor + for mm_feature in sorted(mm_features, key=lambda f: f.mm_position.offset): + offset = mm_feature.mm_position.offset + if mm_feature.modality == "image": + t, h, w = mm_feature.data["image_grid_thw"].data.tolist() + assert t == 1, f"Image must have 1 frame, got {t}" + yield offset, h // spatial_merge_size, w // spatial_merge_size + else: + raise ValueError(f"Unsupported modality: {mm_feature.modality}") + def get_mrope_input_positions( 
self, input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: list[list[int]] | torch.Tensor, - video_grid_thw: list[list[int]] | torch.Tensor, - context_len: int = 0, - seq_len: int | None = None, - second_per_grid_ts: list[float] | None = None, - audio_feature_lengths: torch.Tensor | None = None, - use_audio_in_video: bool = False, + mm_features: list[MultiModalFeatureSpec], ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value.""" - - vision_token_id = getattr(self.config, "image_token_id", 151655) - spatial_merge_size = hf_config.vision_config.pixel_shuffle_scale_factor - input_tokens_tensor = torch.tensor(input_tokens) - - # Find image token positions - image_positions = torch.where(input_tokens_tensor == vision_token_id)[ - 0 - ].tolist() - - # For text-only inputs, use Isaac's original logic from - # compute_position_ids_input_ids() - if len(image_positions) == 0: - seq_len = len(input_tokens) - # Create 3D positions where all dimensions get the same 1D temporal - # progression - position_ids = torch.arange(seq_len, dtype=torch.long) - position_ids = position_ids.view(1, -1).expand(1, -1) # [1, seq_len] - position_ids = position_ids.unsqueeze(2).expand( - -1, -1, 3 - ) # [1, seq_len, 3] - - # vLLM expects shape [3, seq_len], so transpose - position_ids = position_ids.squeeze(0).transpose(0, 1) # [3, seq_len] - - return position_ids, 0 - - events = [] - image_idx = 0 - current_pos = 0 - last_processed_pos = -1 - - for image_pos in image_positions: - if image_pos <= last_processed_pos: - continue # Skip already processed positions - - # Add any text before this image - if image_pos > current_pos: - text_tokens = image_pos - current_pos - text_event = Event( - modality_type=TextType.text, - dims_virtual=[text_tokens, 1], - idx_range=(0, text_tokens), - ) - events.append(text_event) - - # Add image - t, h, w = image_grid_thw[image_idx] - llm_grid_h, llm_grid_w = h // spatial_merge_size, w // spatial_merge_size - 
image_tokens = t * llm_grid_h * llm_grid_w - - image_event = Event( - modality_type=VisionType.image, - dims_virtual=[t, llm_grid_h, llm_grid_w], - idx_range=(0, image_tokens), + llm_pos_ids_list = [] + st = 0 + for offset, llm_grid_h, llm_grid_w in self.iter_mm_grid_hw( + input_tokens, mm_features + ): + text_len = offset - st + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx ) - events.append(image_event) - current_pos = image_pos + image_tokens - last_processed_pos = ( - current_pos - 1 - ) # Mark up to this position as processed - image_idx += 1 + grid_indices = np.indices((1, llm_grid_h, llm_grid_w)).reshape(3, -1) + grid_indices[0, :] = grid_indices[0, :] + text_len + st_idx + llm_pos_ids_list.append(grid_indices) + st = offset + llm_grid_h * llm_grid_w - # Add final text segment if any - if current_pos < len(input_tokens): - text_tokens = len(input_tokens) - current_pos - text_event = Event( - modality_type=TextType.text, - dims_virtual=[text_tokens, 1], - idx_range=(0, text_tokens), + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1][0, -1] + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx ) - events.append(text_event) - stream = Stream(events) - tensor_stream = TensorStream([stream]) + llm_positions = np.concatenate(llm_pos_ids_list, axis=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() - # Use Isaac's native MRoPE calculation - position_ids = compute_mrope_pos_tensor(tensor_stream, n_pos_dims=3) + return torch.from_numpy(llm_positions), mrope_position_delta - # Max position per batch across the 3 planes and sequence dimension: (B,) - m_per_batch = position_ids.amax(dim=(1, 2)) + def _parse_and_validate_image_input( + self, **kwargs: object + ) -> dict[str, torch.Tensor] 
| None: + pixel_values = kwargs.get("pixel_values") + image_grid_thw = kwargs.get("image_grid_thw") + if pixel_values is None or image_grid_thw is None: + return None + return { + "pixel_values": pixel_values, + "image_grid_thw": image_grid_thw, + } - mrope_position_delta = (m_per_batch + 1 - len(input_tokens)).item() + def _process_image_input( + self, + image_input: dict[str, torch.Tensor], + ) -> tuple[torch.Tensor, ...]: + pixel_values = image_input["pixel_values"] + image_grid_thw = image_input["image_grid_thw"] + if pixel_values.numel() == 0: + return () - # vLLM expects shape [3, seq_len] but Isaac returns [batch, seq_len, 3] - # Transpose to match vLLM's expected format - position_ids = position_ids.squeeze(0).transpose(0, 1) + device = next(self.language_model.parameters()).device + dtype = self.vision_embedding.linear_fc1.weight.dtype + pixel_values = pixel_values.to(device=device, dtype=dtype) + if image_grid_thw.dim() == 3: + image_grid_thw = image_grid_thw[0] + spatial_grids = image_grid_thw[:, 1:3].to(device, dtype=torch.int32) - return position_ids, mrope_position_delta + vision_embeddings = self.vision_embedding((pixel_values, spatial_grids)) + merge_size = self.config.vision_config.pixel_shuffle_scale_factor + sizes = spatial_grids.prod(-1) // (merge_size * merge_size) + return tuple(vision_embeddings.split(sizes.tolist())) + + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return () + return self._process_image_input(image_input) def get_multimodal_embeddings( self, **kwargs: object ) -> MultiModalEmbeddings | None: - pixel_values = kwargs.get("pixel_values") - image_grid_thw = kwargs.get("image_grid_thw") - - if pixel_values is None: + # Backward compatibility for older runners. 
+ embeddings = self.embed_multimodal(**kwargs) + if not embeddings: return [] + return embeddings - # Convert image_grid_thw from [batch, 1, [T, H, W]] to [batch, [H, W]] - spatial_grids = image_grid_thw[ - :, 0, 1:3 - ] # Extract H, W from [T, H, W] for each image + def get_language_model(self) -> torch.nn.Module: + return self.language_model - # Process packed sequence patches through vision_embedding module - vision_embeddings = self.vision_embedding((pixel_values, spatial_grids)) - - # Split concatenated embeddings for each image item (following Qwen2-VL pattern) - merge_size = ( - self.config.vision_config.pixel_shuffle_scale_factor - ) # Isaac uses pixel shuffle - sizes = spatial_grids.prod(-1) // ( - merge_size * merge_size - ) # H * W / (merge_size^2) - - return vision_embeddings.split(sizes.tolist()) - - def get_input_embeddings( + def forward( self, input_ids: torch.Tensor, - multimodal_embeddings: MultiModalEmbeddings | None = None, - *, - is_multimodal: torch.Tensor | None = None, - handle_oov_mm_token: bool = False, - ) -> torch.Tensor: - # Get text embeddings from the base language model - inputs_embeds = super().get_input_embeddings(input_ids) + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ) -> torch.Tensor | IntermediateTensors: + return self.language_model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs, + ) - # If we have multimodal embeddings, merge them with text embeddings - if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: - inputs_embeds = _merge_multimodal_embeddings( - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, - ) - - return inputs_embeds + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None: + return 
self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - skip_prefixes = [] - - loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_mm_mapping(self) -> MultiModelKeys: From 7cfd83ad92092c957f4c4745fda7f78f675153c9 Mon Sep 17 00:00:00 2001 From: Oscar Gonzalez Date: Tue, 2 Dec 2025 01:09:19 -0500 Subject: [PATCH 08/18] 1. Remove upstream fa checks (#29471) 2. Remove deprecated xformers (#29262) 3. Updated _get_prompt_updates() Signed-off-by: Oscar Gonzalez --- vllm/model_executor/models/isaac.py | 46 ++++------------------------- 1 file changed, 5 insertions(+), 41 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 82dae62cb56e4..e5a2d5440724a 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -23,12 +23,8 @@ from typing_extensions import TypedDict, Unpack from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import ( - check_upstream_fa_availability, maybe_get_vit_flash_attn_backend, ) -from vllm.attention.ops.vit_attn_wrappers import ( - vit_xformers_attn_wrapper, -) from vllm.config import VllmConfig from vllm.config.model import ModelConfig from vllm.distributed import parallel_state @@ -73,6 +69,7 @@ from vllm.multimodal.processing import ( BaseProcessingInfo, PromptReplacement, PromptUpdate, + PromptUpdateDetails, ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors @@ -1204,14 +1201,7 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: - # hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - hf_config = self.info.get_hf_config() image_processor = 
self.info.get_image_processor(**hf_processor_mm_kwargs) - tokenizer = self.info.get_tokenizer() - placeholder_id = tokenizer.encode( - hf_config.vision_token, - add_special_tokens=False, - ) pixel_shuffle_scale = getattr(image_processor, "pixel_shuffle_scale", 2) merge_length = pixel_shuffle_scale**2 @@ -1221,13 +1211,14 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): grid_thw = out_item["image_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) - num_tokens = int(grid_thw.prod()) // merge_length - return placeholder_id * num_tokens + feature_size = int(grid_thw.prod()) // merge_length + repl_full = "<|image_pad|>" * feature_size + return PromptUpdateDetails.select_text(repl_full, "<|image_pad|>") return [ PromptReplacement( modality="image", - target=placeholder_id, + target="", replacement=get_replacement_isaac, ) ] @@ -1259,7 +1250,6 @@ class Siglip2VisionAttention(nn.Module): *, prefix: str = "", use_data_parallel: bool = False, - use_upstream_fa: bool = False, attn_backend: AttentionBackendEnum | None = None, attn_backend_override: AttentionBackendEnum | None = None, ) -> None: @@ -1296,19 +1286,11 @@ class Siglip2VisionAttention(nn.Module): disable_tp=use_data_parallel, ) - self.use_upstream_fa = use_upstream_fa self.attn_backend = attn_backend - if self.attn_backend not in { - AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.ROCM_AITER_FA, - } and check_upstream_fa_availability(torch.get_default_dtype()): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN - self.use_upstream_fa = True if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.TORCH_SDPA, - AttentionBackendEnum.XFORMERS, AttentionBackendEnum.ROCM_AITER_FA, }: raise RuntimeError( @@ -1317,7 +1299,6 @@ class Siglip2VisionAttention(nn.Module): self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - self.use_upstream_fa, attn_backend_override=attn_backend_override, ) ) @@ -1389,10 +1370,6 @@ 
class Siglip2VisionAttention(nn.Module): context_layer = rearrange( context_layer, "b s h d -> s b (h d)" ).contiguous() - elif self.attn_backend == AttentionBackendEnum.XFORMERS: - if seqlens is None: - raise ValueError("xFormers attention backend requires seqlens tensor.") - context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens) else: raise RuntimeError( f"Isaac vision embedding does not support {self.attn_backend} backend." @@ -1412,7 +1389,6 @@ class Siglip2EncoderLayer(nn.Module): prefix: str = "", attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, attn_backend_override: AttentionBackendEnum | None = None, - use_upstream_fa: bool = False, use_data_parallel: bool = False, ) -> None: super().__init__() @@ -1423,7 +1399,6 @@ class Siglip2EncoderLayer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.self_attn", use_data_parallel=use_data_parallel, - use_upstream_fa=use_upstream_fa, attn_backend=attn_backend, attn_backend_override=attn_backend_override, ) @@ -1481,17 +1456,9 @@ class Siglip2Encoder(nn.Module): dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - self.use_upstream_fa = False - if self.attn_backend not in { - AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.ROCM_AITER_FA, - } and check_upstream_fa_availability(torch.get_default_dtype()): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN - self.use_upstream_fa = True if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.TORCH_SDPA, - AttentionBackendEnum.XFORMERS, AttentionBackendEnum.ROCM_AITER_FA, }: raise RuntimeError( @@ -1505,7 +1472,6 @@ class Siglip2Encoder(nn.Module): prefix=f"{prefix}.layers.{layer_idx}", attn_backend=self.attn_backend, attn_backend_override=attn_backend_override, - use_upstream_fa=self.use_upstream_fa, use_data_parallel=use_data_parallel, ) for layer_idx in range(config.num_hidden_layers) @@ -1565,8 +1531,6 @@ class Siglip2VisionTransformer(nn.Module): 
AttentionBackendEnum.ROCM_AITER_FA, }: max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() - elif self.encoder.attn_backend == AttentionBackendEnum.XFORMERS: - seqlens = cu_seqlens[1:] - cu_seqlens[:-1] return max_seqlen, seqlens def forward( From 54f5c92a9b8478604c574a24cd21ca717a264d99 Mon Sep 17 00:00:00 2001 From: Yang Date: Fri, 12 Dec 2025 17:49:03 -0800 Subject: [PATCH 09/18] fix tokenizer error Signed-off-by: Yang --- vllm/model_executor/models/isaac.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index e5a2d5440724a..e37c8aad1524b 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -73,10 +73,8 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.tokenizer import ( - get_cached_tokenizer, - get_tokenizer, -) +from vllm.tokenizers import get_tokenizer +from vllm.tokenizers.hf import get_cached_tokenizer logger = init_logger(__name__) From 748508ae4fc9113d527abac56f753f7ecebdcebe Mon Sep 17 00:00:00 2001 From: Yang Date: Fri, 12 Dec 2025 17:51:36 -0800 Subject: [PATCH 10/18] remove log Signed-off-by: Yang --- vllm/model_executor/models/isaac.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index e37c8aad1524b..28fb68d62052a 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -29,7 +29,6 @@ from vllm.config import VllmConfig from vllm.config.model import ModelConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.logger import init_logger from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -76,8 +75,6 @@ from vllm.sequence import IntermediateTensors from vllm.tokenizers 
import get_tokenizer from vllm.tokenizers.hf import get_cached_tokenizer -logger = init_logger(__name__) - # ===== TensorStream Compatibility Layer for Isaac MRoPE ===== # Minimal implementation of TensorStream classes needed for Isaac's 3D positional # encoding @@ -1704,7 +1701,6 @@ class IsaacForConditionalGeneration( ) config.image_token_id = self.vision_token_id - logger.info("vllm config: %s", repr(vllm_config)) config.rope_scaling["mrope_section"] = calculated_mrope_section self.language_model = init_vllm_registered_model( vllm_config=vllm_config, From c9f3d502c927d94cadc580529d60c72a496a2448 Mon Sep 17 00:00:00 2001 From: oscardev256 <42308241+oscardev256@users.noreply.github.com> Date: Mon, 15 Dec 2025 20:25:03 -0500 Subject: [PATCH 11/18] Remove TensorStream logic no longer needed after updating get_mrope_input_positions. Signed-off-by: oscardev256 <42308241+oscardev256@users.noreply.github.com> --- vllm/model_executor/models/isaac.py | 282 ++-------------------------- 1 file changed, 13 insertions(+), 269 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 28fb68d62052a..17e498c3cc8dd 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -2,11 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations -import itertools import math from collections.abc import Iterable, Iterator, Mapping, Sequence -from dataclasses import dataclass -from enum import Enum from typing import Any import numpy as np @@ -75,272 +72,6 @@ from vllm.sequence import IntermediateTensors from vllm.tokenizers import get_tokenizer from vllm.tokenizers.hf import get_cached_tokenizer -# ===== TensorStream Compatibility Layer for Isaac MRoPE ===== -# Minimal implementation of TensorStream classes needed for Isaac's 3D positional -# encoding - - -class ModalityType(Enum): - """ - Base class for modality-type enumerations. 
- Each derived class (VisionType, TextType) holds - an integer value that identifies a specific modality. - - Example usage: - If you have an object `my_event` of class `Event`, - you might write: - if my_event.type == VisionType.image: - # process an image frame - - The methods below implement ordering and hashing - based on the integer `.value` of each enum member. - """ - - @property - def modality(self): - return self.__class__ - - def __lt__(self, other): - if isinstance(other, ModalityType): - return self.value < other.value - raise NotImplementedError() - - def __eq__(self, other): - if isinstance(other, ModalityType): - return self.value == other.value - raise NotImplementedError() - - def __hash__(self): - return hash(self.value) - - -# NOTE: modality types need to be unique -class VisionType(ModalityType): - """ - Enum for vision modalities such as key video frames. - Typically used in video processing or image sequences. - - Members: - image: A single image frame. - """ - - image = 0 - - -class TextType(ModalityType): - """ - Enum for text tokens and padding. - - Members: - text: Actual textual tokens. - padding: Padding tokens used in sequence batching. - """ - - text = 1 - padding = 2 - - -@dataclass -class Event: - """Represents a single modality event with spatial/temporal dimensions.""" - - """ - Represents a single data occurrence (with a specific type, time interval, and - data payload). - - Attributes: - data (Any): The actual data payload (e.g. a torch.Tensor, a string, - etc.). - type (ModalityType): The modality type of the data (e.g., - VisionType.image). - time (Tuple[float, float]): (start_time, end_time) indicating when this - Event occurs. - role (Optional[str]): The role associated with this event (e.g., "user", - "agent", "system"). If None, the event is always included in loss - calculation. - - Example usage: - evt = Event(data=torch.zeros((1, 224, 224, 3)), # e.g. 
a single image frame - type=VisionType.image, - time=(0.0, 0.04), - role="user") - """ - # Descriptors - modality_type: ModalityType - - # Structure - dims_virtual: list[int] | None = ( - None # virtual/processed dimensions (e.g., pixel-shuffled) - ) - dims_real: list[int] | None = None # real/actual tensor dimensions - idx_range: tuple[int, int] | None = None - - def dims(self, virtual: bool = True) -> list[int] | None: - """ - Get the dimensions of this event. - - Args: - virtual: If True (default), return virtual/processed dimensions - (e.g., pixel-shuffled). If False, return real/actual tensor - dimensions. - - Returns: - Dimensions list or None if not measured. - """ - if virtual: - return self.dims_virtual - else: - return self.dims_real - - def num_tokens(self, partial=True, virtual=True) -> int: - if not virtual: - assert partial is False and isinstance(self.data, torch.Tensor) - return math.prod(self.dims(virtual=False)) - return ( - self.idx_range[1] - self.idx_range[0] if partial else math.prod(self.dims()) - ) - - -@dataclass -class Stream: - """ - Represents an ordered sequence of Event objects, each with - a specific ModalityType and a time range. - - Attributes: - events (List[Event]): The list of Event objects in the stream. - priority (List[ModalityType]): A list of modality types that define - how we might want to reorder or prioritize events if scheduling is needed. 
- - Example usage: - # Create two events of different types - evt1 = Event(torch.zeros((1, 224, 224, 3)), VisionType.image, (0.0, 0.04)) - evt2 = Event(torch.randint(0, 1000, (16, 1)), TextType.text, (0.0, 0.32)) - - # Make a stream with a given priority - s = Stream(events=[evt1, evt2], - priority=[VisionType.image, TextType.text]) - - print(s) - """ - - events: list[Event] - - def __len__(self): - """Returns the number of Event objects in this Stream.""" - return len(self.events) - - def __getitem__(self, key: int) -> Stream | Event: - return self.events[key] - - def __iter__(self): - """ - Yields each Event in the Stream, enabling iteration like: - for event in my_stream: - ... - """ - yield from self.events - - -# TODO: implement all types of cool indexing which can happen since TensorStream -# assumes Event.data = Tensor -@dataclass -class TensorStream: - streams: list[Stream] - _device: torch.device | None = None - - @property - def device(self): - return self._device - - @property - def shape(self): - seq_lens = [sum([ev.num_tokens() for ev in stream]) for stream in self.streams] - assert all([sl == seq_lens[0] for sl in seq_lens]), ( - f"each stream must have same token count to have a shape: {seq_lens}" - ) - return (len(seq_lens), seq_lens[0]) - - -def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Tensor: - """ - Create a (batch, T, n_pos_dims) position tensor in one sweep. - The first dim is the running “time” index, the rest are spatial (or 1-fillers). 
- - Args: - ts : TensorStream - n_pos_dims : total coordinate dimensions (default 3) - - Returns: - torch.LongTensor - shape (batch_size, seq_len, n_pos_dims) - """ - - # Manually iterate through streams and events like map_compact does, - # but maintain cumulative time offset for each stream - all_coords = [] - for stream in ts.streams: # one Stream == one batch sample - cumulative_offset = 0 # running time index for this stream - - for event in stream: - # --- build coordinate grid for THIS event using itertools - # (no tensor ops) --- - dims = (event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or [])) - - # Create ranges for each dimension (similar to old _finalize implementation) - first_dim = list(range(cumulative_offset, cumulative_offset + dims[0])) - cumulative_offset += dims[0] # advance time for the next event - - if event.modality_type != VisionType.image: - full_coords = [(t, t, t) for t in first_dim] - else: - other_dims = [range(d) for d in dims[1:]] - full_coords = list(itertools.product(first_dim, *other_dims)) - - # Slice if the event is partial - s, e = event.idx_range - coords = full_coords[s:e] - - # Extend the flattened coordinate list - all_coords.extend(coords) - - # Convert to tensor and reshape to (B, T, n_pos_dims) - B, T = ts.shape - return torch.tensor(all_coords, dtype=torch.long, device=ts.device).reshape( - B, T, n_pos_dims - ) - - -def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int: - tokenizer_name = model_config.tokenizer or model_config.model - tokenizer = get_cached_tokenizer( - get_tokenizer( - tokenizer_name, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.tokenizer_revision or model_config.revision, - ) - ) - return tokenizer.encode(vision_token, add_special_tokens=False)[0] - - -def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor: - """Create boolean mask for specific modality type in 
the tensor stream.""" - B, T = ts.shape - mask = torch.zeros((B, T), dtype=torch.bool, device=ts.device) - - for batch_idx, stream in enumerate(ts.streams): - seq_idx = 0 - for event in stream: - if event.modality_type == modality_type: - start, end = event.idx_range - mask[batch_idx, seq_idx : seq_idx + (end - start)] = True - seq_idx += event.idx_range[1] - event.idx_range[0] - - return mask - - -# ===== End TensorStream Compatibility Layer ===== - class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): """Vision configuration for Isaac with Pixel Shuffle support. @@ -738,6 +469,19 @@ _MEAN_TENSOR = torch.tensor(VISION_MEAN, dtype=torch.float32).view(1, 1, 1, -1) _STD_TENSOR = torch.tensor(VISION_STD, dtype=torch.float32).view(1, 1, 1, -1) +def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int: + tokenizer_name = model_config.tokenizer or model_config.model + tokenizer = get_cached_tokenizer( + get_tokenizer( + tokenizer_name, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.tokenizer_revision or model_config.revision, + ) + ) + return tokenizer.encode(vision_token, add_special_tokens=False)[0] + + def prepare_image_tensor( image: torch.Tensor, scale: float = VISION_SCALE, From 2c51f83762acd22d363c41b65a83b2fb91ccc38b Mon Sep 17 00:00:00 2001 From: oscardev256 <42308241+oscardev256@users.noreply.github.com> Date: Tue, 16 Dec 2025 00:49:43 -0500 Subject: [PATCH 12/18] Update for merge_by_field_config and MultiModalKwargs deprecations. 
Signed-off-by: oscardev256 <42308241+oscardev256@users.noreply.github.com> --- vllm/model_executor/models/isaac.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 17e498c3cc8dd..d4bb11468c11f 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -57,7 +57,7 @@ from vllm.multimodal.inputs import ( MultiModalDataDict, MultiModalFeatureSpec, MultiModalFieldConfig, - MultiModalKwargs, + MultiModalKwargsItems, ) from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import ( @@ -938,7 +938,7 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, Any], - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) @@ -1391,7 +1391,7 @@ class IsaacVisionEmbedding(nn.Module): class IsaacForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): - merge_by_field_config = True + packed_modules_mapping = { "qkv_proj": [ "q_proj", From 4cdd788dd009a5aa07dfca4f3c9afc56bd9a1a96 Mon Sep 17 00:00:00 2001 From: oscardev256 <42308241+oscardev256@users.noreply.github.com> Date: Tue, 16 Dec 2025 01:05:23 -0500 Subject: [PATCH 13/18] Apply ruff formatting Signed-off-by: oscardev256 <42308241+oscardev256@users.noreply.github.com> --- vllm/model_executor/models/isaac.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index d4bb11468c11f..85d9568b89048 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -1391,7 +1391,6 @@ class IsaacVisionEmbedding(nn.Module): class IsaacForConditionalGeneration( nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, 
SupportsMRoPE ): - packed_modules_mapping = { "qkv_proj": [ "q_proj", From c7c3853e9e3b3e0341d251e5a63879ee849cf769 Mon Sep 17 00:00:00 2001 From: Yang Date: Wed, 17 Dec 2025 18:30:44 -0800 Subject: [PATCH 14/18] fix cr Signed-off-by: Yang --- vllm/model_executor/models/isaac.py | 192 +++++++--------------------- 1 file changed, 44 insertions(+), 148 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 85d9568b89048..097363f83c4dd 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -18,11 +18,8 @@ from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfi from transformers.tokenization_utils import TensorType from typing_extensions import TypedDict, Unpack -from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.layer import ( - maybe_get_vit_flash_attn_backend, -) -from vllm.config import VllmConfig +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention +from vllm.config import MultiModalConfig, VllmConfig from vllm.config.model import ModelConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils @@ -51,7 +48,6 @@ from vllm.model_executor.models.utils import ( init_vllm_registered_model, maybe_prefix, ) -from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, @@ -97,11 +93,15 @@ class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): def create_cumulative_seq_lengths( seq_sizes: torch.Tensor, device: torch.device -) -> tuple[torch.Tensor, int]: +) -> tuple[torch.Tensor, torch.Tensor]: """Create cumulative sequence lengths for variable-length attention.""" cu_seqlens = torch.zeros(len(seq_sizes) + 1, dtype=torch.int32, device=device) cu_seqlens[1:] = seq_sizes.cumsum(0) - max_seqlen = int(seq_sizes.max().item()) if len(seq_sizes) > 0 else 0 
+ max_seqlen = ( + seq_sizes.max() + if len(seq_sizes) > 0 + else torch.tensor(0, dtype=torch.int32, device=device) + ) return cu_seqlens, max_seqlen @@ -763,9 +763,6 @@ class IsaacImageProcessor: class IsaacProcessor: """Processor wrapper (tokenizer + IsaacImageProcessor).""" - attributes = ["tokenizer"] - tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): self.image_token = kwargs.pop("image_token", "") self.image_processor = image_processor or IsaacImageProcessor(kwargs) @@ -963,24 +960,6 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor): ] -def all_gather_interleave(local_tensor: torch.Tensor, hidden_size: int, tp_size: int): - """All-gather the input tensor interleavely across model parallel group.""" - import torch.distributed as dist - - gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)] - dist.all_gather( - gathered_tensors, local_tensor, group=parallel_state.get_tp_group().device_group - ) - - gathered_tensors_split = [ - torch.split(tensor, hidden_size // tp_size, -1) for tensor in gathered_tensors - ] - ordered_tensors = [ - tensor for pair in zip(*gathered_tensors_split) for tensor in pair - ] - return torch.cat(ordered_tensors, dim=-1) - - class Siglip2VisionAttention(nn.Module): def __init__( self, @@ -988,12 +967,15 @@ class Siglip2VisionAttention(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - use_data_parallel: bool = False, - attn_backend: AttentionBackendEnum | None = None, - attn_backend_override: AttentionBackendEnum | None = None, + multimodal_config: MultiModalConfig | None = None, ) -> None: super().__init__() + use_data_parallel = ( + multimodal_config.mm_encoder_tp_mode == "data" + if multimodal_config + else False + ) self.tp_size = ( 1 if use_data_parallel @@ -1025,26 +1007,12 @@ class Siglip2VisionAttention(nn.Module): disable_tp=use_data_parallel, ) - self.attn_backend = attn_backend - - if 
self.attn_backend not in { - AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.TORCH_SDPA, - AttentionBackendEnum.ROCM_AITER_FA, - }: - raise RuntimeError( - f"Isaac vision embedding does not support {self.attn_backend} backend." - ) - self.attn_backend, self.flash_attn_varlen_func = ( - maybe_get_vit_flash_attn_backend( - self.attn_backend, - attn_backend_override=attn_backend_override, - ) + self.attn = MMEncoderAttention( + num_heads=self.num_attention_heads_per_partition, + head_size=self.hidden_size_per_attention_head, + prefix=f"{prefix}.attn", + multimodal_config=multimodal_config, ) - self.is_flash_attn_backend = self.attn_backend in { - AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.ROCM_AITER_FA, - } def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: seq_len, bs, _ = qkv.shape @@ -1064,7 +1032,6 @@ class Siglip2VisionAttention(nn.Module): *, cu_seqlens: torch.Tensor, max_seqlen: torch.Tensor | None, - seqlens: torch.Tensor | None, ) -> torch.Tensor: batch_size, _, _ = hidden_states.shape if batch_size != 1: @@ -1074,45 +1041,14 @@ class Siglip2VisionAttention(nn.Module): q, k, v = self.split_qkv(x) q, k, v = (rearrange(t, "s b h d -> b s h d") for t in (q, k, v)) - if self.is_flash_attn_backend: - q, k, v = (rearrange(t, "b s ... 
-> (b s) ...") for t in (q, k, v)) - output = self.flash_attn_varlen_func( - q, - k, - v, - cu_seqlens_q=cu_seqlens, - cu_seqlens_k=cu_seqlens, - max_seqlen_q=max_seqlen, - max_seqlen_k=max_seqlen, - dropout_p=0.0, - causal=False, - ) - context_layer = rearrange( - output, "(b s) h d -> s b (h d)", b=batch_size - ).contiguous() - elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: - outputs = [] - for i in range(1, len(cu_seqlens)): - start_idx = cu_seqlens[i - 1] - end_idx = cu_seqlens[i] - q_i = q[:, start_idx:end_idx] - k_i = k[:, start_idx:end_idx] - v_i = v[:, start_idx:end_idx] - q_i, k_i, v_i = ( - rearrange(tensor, "b s h d -> b h s d") - for tensor in (q_i, k_i, v_i) - ) - output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) - output_i = rearrange(output_i, "b h s d -> b s h d") - outputs.append(output_i) - context_layer = torch.cat(outputs, dim=1) - context_layer = rearrange( - context_layer, "b s h d -> s b (h d)" - ).contiguous() - else: - raise RuntimeError( - f"Isaac vision embedding does not support {self.attn_backend} backend." 
- ) + context_layer = self.attn( + query=q, + key=k, + value=v, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + ) + context_layer = rearrange(context_layer, "b s h d -> s b (h d)").contiguous() output, _ = self.out_proj(context_layer) output = rearrange(output, "s b d -> b s d") @@ -1126,9 +1062,7 @@ class Siglip2EncoderLayer(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, - attn_backend_override: AttentionBackendEnum | None = None, - use_data_parallel: bool = False, + multimodal_config: MultiModalConfig | None = None, ) -> None: super().__init__() self.embed_dim = config.hidden_size @@ -1137,9 +1071,7 @@ class Siglip2EncoderLayer(nn.Module): config, quant_config=quant_config, prefix=f"{prefix}.self_attn", - use_data_parallel=use_data_parallel, - attn_backend=attn_backend, - attn_backend_override=attn_backend_override, + multimodal_config=multimodal_config, ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( @@ -1154,7 +1086,6 @@ class Siglip2EncoderLayer(nn.Module): *, cu_seqlens: torch.Tensor, max_seqlen: torch.Tensor | None, - seqlens: torch.Tensor | None, ) -> torch.Tensor: residual = hidden_states @@ -1163,7 +1094,6 @@ class Siglip2EncoderLayer(nn.Module): hidden_states=hidden_states, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, - seqlens=seqlens, ) hidden_states = residual + hidden_states @@ -1182,36 +1112,17 @@ class Siglip2Encoder(nn.Module): quant_config: QuantizationConfig | None = None, *, prefix: str = "", - use_data_parallel: bool = False, - attn_backend_override: AttentionBackendEnum | None = None, + multimodal_config: MultiModalConfig | None = None, ) -> None: super().__init__() self.config = config - embed_dim = config.hidden_size - num_heads = config.num_attention_heads - head_dim = embed_dim // num_heads - self.attn_backend = get_vit_attn_backend( - head_size=head_dim, - 
dtype=torch.get_default_dtype(), - attn_backend_override=attn_backend_override, - ) - if self.attn_backend not in { - AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.TORCH_SDPA, - AttentionBackendEnum.ROCM_AITER_FA, - }: - raise RuntimeError( - f"Isaac vision embedding does not support {self.attn_backend} backend." - ) self.layers = nn.ModuleList( [ Siglip2EncoderLayer( config, quant_config=quant_config, prefix=f"{prefix}.layers.{layer_idx}", - attn_backend=self.attn_backend, - attn_backend_override=attn_backend_override, - use_data_parallel=use_data_parallel, + multimodal_config=multimodal_config, ) for layer_idx in range(config.num_hidden_layers) ] @@ -1223,7 +1134,6 @@ class Siglip2Encoder(nn.Module): *, cu_seqlens: torch.Tensor | None = None, max_seqlen: torch.Tensor | None = None, - seqlens: torch.Tensor | None = None, ) -> torch.Tensor: hidden_states = inputs_embeds for encoder_layer in self.layers: @@ -1231,7 +1141,6 @@ class Siglip2Encoder(nn.Module): hidden_states, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, - seqlens=seqlens, ) return hidden_states @@ -1242,8 +1151,7 @@ class Siglip2VisionTransformer(nn.Module): config: PixelShuffleSiglip2VisionConfig, quant_config: QuantizationConfig | None = None, prefix: str = "", - use_data_parallel: bool = False, - attn_backend_override: AttentionBackendEnum | None = None, + multimodal_config: MultiModalConfig | None = None, ): super().__init__() self.config = config @@ -1256,22 +1164,10 @@ class Siglip2VisionTransformer(nn.Module): config, quant_config=quant_config, prefix=f"{prefix}.encoder", - use_data_parallel=use_data_parallel, - attn_backend_override=attn_backend_override, + multimodal_config=multimodal_config, ) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - def compute_attn_mask_seqlen( - self, cu_seqlens: torch.Tensor - ) -> tuple[torch.Tensor | None, torch.Tensor | None]: - max_seqlen, seqlens = None, None - if self.encoder.attn_backend in { - 
AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.ROCM_AITER_FA, - }: - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() - return max_seqlen, seqlens - def forward( self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor], @@ -1291,14 +1187,14 @@ class Siglip2VisionTransformer(nn.Module): # Add a pseudo batch dimension for the encoder hidden_states = hidden_states.unsqueeze(0) - cu_seqlens, _ = create_cumulative_seq_lengths(seq_sizes, hidden_states.device) - max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + cu_seqlens, max_seqlen = create_cumulative_seq_lengths( + seq_sizes, hidden_states.device + ) hidden_states = self.encoder( inputs_embeds=hidden_states, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, - seqlens=seqlens, ) hidden_states = self.post_layernorm(hidden_states) @@ -1349,18 +1245,22 @@ class IsaacVisionEmbedding(nn.Module): hidden_dim: int, output_dim: int, quant_config: QuantizationConfig | None = None, + multimodal_config: MultiModalConfig | None = None, prefix: str = "", ): super().__init__() self.transformer = Siglip2VisionTransformer( - vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding") + vision_cfg, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "0"), + multimodal_config=multimodal_config, ) self.linear_fc1 = ColumnParallelLinear( hidden_dim, 4 * hidden_dim, bias=False, quant_config=quant_config, - prefix=maybe_prefix(prefix, "vision_embedding.1"), + prefix=maybe_prefix(prefix, "1"), return_bias=False, ) self.act = nn.SiLU() @@ -1369,7 +1269,7 @@ class IsaacVisionEmbedding(nn.Module): output_dim, bias=False, quant_config=quant_config, - prefix=maybe_prefix(prefix, "vision_embedding.3"), + prefix=maybe_prefix(prefix, "3"), return_bias=False, ) @@ -1457,11 +1357,6 @@ class IsaacForConditionalGeneration( vision_cfg = config.vision_config if vision_cfg is None: raise ValueError("IsaacConfig should always have vision_config") - vision_cfg.preserve_original_pe = True - vision_cfg.use_rope = False - 
vision_cfg.hidden_stride = vision_cfg.pixel_shuffle_scale_factor - vision_cfg.window_size = 32 * 2 - vision_cfg.fullatt_block_indexes = None attn_impl = ( config.vision_attn_implementation if config.vision_attn_implementation is not None @@ -1476,6 +1371,7 @@ class IsaacForConditionalGeneration( hidden_dim=hidden_dim, output_dim=config.hidden_size, quant_config=quant_config, + multimodal_config=self.multimodal_config, prefix=maybe_prefix(prefix, "vision_embedding"), ) From c4a6119925e1e7575704b20a9b134834ed71ca38 Mon Sep 17 00:00:00 2001 From: Yang Date: Thu, 18 Dec 2025 17:19:39 -0800 Subject: [PATCH 15/18] fix cr Signed-off-by: Yang --- vllm/model_executor/models/isaac.py | 125 +++++++------------- vllm/transformers_utils/config.py | 1 + vllm/transformers_utils/configs/__init__.py | 4 + vllm/transformers_utils/configs/isaac.py | 86 ++++++++++++++ 4 files changed, 131 insertions(+), 85 deletions(-) create mode 100644 vllm/transformers_utils/configs/isaac.py diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 097363f83c4dd..f435b24335a62 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -4,7 +4,7 @@ from __future__ import annotations import math from collections.abc import Iterable, Iterator, Mapping, Sequence -from typing import Any +from typing import Annotated, Any import numpy as np import PIL.Image @@ -12,9 +12,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from transformers import Qwen3Config from transformers.image_processing_utils import BatchFeature -from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig from transformers.tokenization_utils import TensorType from typing_extensions import TypedDict, Unpack @@ -67,28 +65,11 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.tokenizers import get_tokenizer from 
vllm.tokenizers.hf import get_cached_tokenizer - - -class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): - """Vision configuration for Isaac with Pixel Shuffle support. - - Extends Siglip2VisionConfig with additional fields for pixel shuffle. - """ - - model_type = "pixel_shuffle_siglip2" - base_config_key = "vision_config" - - def __init__( - self, - pixel_shuffle_scale_factor: int = 1, - num_patches: int = 256, - **kwargs, - ): - super().__init__(**kwargs) - - # Add our custom fields - self.pixel_shuffle_scale_factor = pixel_shuffle_scale_factor - self.num_patches = num_patches +from vllm.transformers_utils.configs import ( + IsaacConfig, + PixelShuffleSiglip2VisionConfig, +) +from vllm.utils.tensor_schema import TensorSchema, TensorShape def create_cumulative_seq_lengths( @@ -629,58 +610,6 @@ def process_vision_for_patches( return patches, dims_virtual -class IsaacConfig(Qwen3Config): - """Configuration class for Isaac multimodal model.""" - - model_type = "isaac" - sub_configs = {"vision_config": PixelShuffleSiglip2VisionConfig} - - def __init__( - self, - vision_config=None, - vision_patch_size: int = 16, - vision_max_num_patches: int = 256, - vision_min_num_patches: int | None = None, - pixel_shuffle_scale: int = 1, - max_sequence_length: int = 16384, - vision_token: str = "", - vision_attn_implementation: str | None = None, - **kwargs, - ): - super().__init__(**kwargs) - - # EventStreamProcessor parameters (for backward compatibility) - self.video_patch_size = vision_patch_size - self.vision_max_num_patches = vision_max_num_patches - self.vision_min_num_patches = vision_min_num_patches - self.pixel_shuffle_scale = pixel_shuffle_scale - - # Processing parameters - self.max_sequence_length = max_sequence_length - self.vision_token = vision_token - - # Handle vision config - PixelShuffleSiglip2VisionConfig instance - if isinstance(vision_config, dict): - self.vision_config = PixelShuffleSiglip2VisionConfig(**vision_config) - elif vision_config is None: - 
self.vision_config = PixelShuffleSiglip2VisionConfig() - else: - self.vision_config = vision_config - - # Ensure compatibility with pretrained checkpoints - self.vision_config.pixel_shuffle_scale_factor = getattr( - self.vision_config, - "pixel_shuffle_scale_factor", - pixel_shuffle_scale, - ) - self.vision_config.num_patches = getattr( - self.vision_config, - "num_patches", - vision_max_num_patches, - ) - self.vision_attn_implementation = vision_attn_implementation - - class IsaacImageProcessorKwargs(TypedDict, total=False): patch_size: int max_num_patches: int @@ -914,6 +843,32 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): } +class IsaacImagePixelInputs(TensorSchema): + """ + Schema for validating Isaac image inputs. + + Dimensions: + - np: Number of patches + - d: Patch dimension + - ni: Number of images + + The schema enforces: + - pixel_values must be 2D: (num_patches, patch_dim) + - image_grid_thw must be 2D: (num_images, 3) + where 3 represents [T, H, W] + """ + + pixel_values: Annotated[ + torch.Tensor, + TensorShape("np", "d"), + ] + + image_grid_thw: Annotated[ + torch.Tensor, + TensorShape("ni", 3), + ] + + class IsaacMultiModalProcessor(BaseMultiModalProcessor): def _get_mm_fields_config( self, @@ -1423,19 +1378,21 @@ class IsaacForConditionalGeneration( def _parse_and_validate_image_input( self, **kwargs: object - ) -> dict[str, torch.Tensor] | None: + ) -> IsaacImagePixelInputs | None: pixel_values = kwargs.get("pixel_values") image_grid_thw = kwargs.get("image_grid_thw") if pixel_values is None or image_grid_thw is None: return None - return { - "pixel_values": pixel_values, - "image_grid_thw": image_grid_thw, - } + + # TensorSchema will automatically validate shapes on initialization + return IsaacImagePixelInputs( + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + ) def _process_image_input( self, - image_input: dict[str, torch.Tensor], + image_input: IsaacImagePixelInputs, ) -> tuple[torch.Tensor, 
...]: pixel_values = image_input["pixel_values"] image_grid_thw = image_input["image_grid_thw"] @@ -1445,8 +1402,6 @@ class IsaacForConditionalGeneration( device = next(self.language_model.parameters()).device dtype = self.vision_embedding.linear_fc1.weight.dtype pixel_values = pixel_values.to(device=device, dtype=dtype) - if image_grid_thw.dim() == 3: - image_grid_thw = image_grid_thw[0] spatial_grids = image_grid_thw[:, 1:3].to(device, dtype=torch.int32) vision_embeddings = self.vision_embedding((pixel_values, spatial_grids)) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index a11d37b4b2edf..d487462613bdd 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -72,6 +72,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( deepseek_v32="DeepseekV3Config", flex_olmo="FlexOlmoConfig", hunyuan_vl="HunYuanVLConfig", + isaac="IsaacConfig", kimi_linear="KimiLinearConfig", kimi_vl="KimiVLConfig", RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 54fe1b8d7b523..00d5ecd25c38d 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -25,6 +25,7 @@ _CLASS_TO_MODULE: dict[str, str] = { "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl", "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl", "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl", + "IsaacConfig": "vllm.transformers_utils.configs.isaac", # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
@@ -41,6 +42,7 @@ _CLASS_TO_MODULE: dict[str, str] = { "NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h", "Olmo3Config": "vllm.transformers_utils.configs.olmo3", "OvisConfig": "vllm.transformers_utils.configs.ovis", + "PixelShuffleSiglip2VisionConfig": "vllm.transformers_utils.configs.isaac", "RadioConfig": "vllm.transformers_utils.configs.radio", "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base", "UltravoxConfig": "vllm.transformers_utils.configs.ultravox", @@ -65,6 +67,7 @@ __all__ = [ "HunYuanVLConfig", "HunYuanVLTextConfig", "HunYuanVLVisionConfig", + "IsaacConfig", "RWConfig", "JAISConfig", "Lfm2MoeConfig", @@ -78,6 +81,7 @@ __all__ = [ "NemotronHConfig", "Olmo3Config", "OvisConfig", + "PixelShuffleSiglip2VisionConfig", "RadioConfig", "SpeculatorsConfig", "UltravoxConfig", diff --git a/vllm/transformers_utils/configs/isaac.py b/vllm/transformers_utils/configs/isaac.py new file mode 100644 index 0000000000000..fc15011b54911 --- /dev/null +++ b/vllm/transformers_utils/configs/isaac.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +from transformers import Qwen3Config +from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig + + +class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): + """Vision configuration for Isaac with Pixel Shuffle support. + + Extends Siglip2VisionConfig with additional fields for pixel shuffle. 
+ """ + + model_type = "pixel_shuffle_siglip2" + base_config_key = "vision_config" + + def __init__( + self, + pixel_shuffle_scale_factor: int = 1, + num_patches: int = 256, + **kwargs, + ): + super().__init__(**kwargs) + + # Add our custom fields + self.pixel_shuffle_scale_factor = pixel_shuffle_scale_factor + self.num_patches = num_patches + + +class IsaacConfig(Qwen3Config): + """Configuration class for Isaac multimodal model.""" + + model_type = "isaac" + sub_configs = {"vision_config": PixelShuffleSiglip2VisionConfig} + + def __init__( + self, + vision_config=None, + vision_patch_size: int = 16, + vision_max_num_patches: int = 256, + vision_min_num_patches: int | None = None, + pixel_shuffle_scale: int = 1, + max_sequence_length: int = 16384, + vision_token: str = "", + vision_attn_implementation: str | None = None, + **kwargs, + ): + super().__init__(**kwargs) + + # EventStreamProcessor parameters (for backward compatibility) + self.video_patch_size = vision_patch_size + self.vision_max_num_patches = vision_max_num_patches + self.vision_min_num_patches = vision_min_num_patches + self.pixel_shuffle_scale = pixel_shuffle_scale + + # Processing parameters + self.max_sequence_length = max_sequence_length + self.vision_token = vision_token + + # Handle vision config - PixelShuffleSiglip2VisionConfig instance + if isinstance(vision_config, dict): + self.vision_config = PixelShuffleSiglip2VisionConfig(**vision_config) + elif vision_config is None: + self.vision_config = PixelShuffleSiglip2VisionConfig() + else: + self.vision_config = vision_config + + # Ensure compatibility with pretrained checkpoints + self.vision_config.pixel_shuffle_scale_factor = getattr( + self.vision_config, + "pixel_shuffle_scale_factor", + pixel_shuffle_scale, + ) + self.vision_config.num_patches = getattr( + self.vision_config, + "num_patches", + vision_max_num_patches, + ) + self.vision_attn_implementation = vision_attn_implementation + + +__all__ = [ + "IsaacConfig", + 
"PixelShuffleSiglip2VisionConfig", +] From af3529f6519b125b2d9bec435ad924187578ba6e Mon Sep 17 00:00:00 2001 From: Oscar Gonzalez Date: Mon, 22 Dec 2025 16:49:59 -0500 Subject: [PATCH 16/18] Added test for validating correctness of Isaac-0.1. Signed-off-by: Oscar Gonzalez --- .../multimodal/generation/test_isaac.py | 269 ++++++++++++++++++ vllm/model_executor/models/isaac.py | 29 +- 2 files changed, 293 insertions(+), 5 deletions(-) create mode 100644 tests/models/multimodal/generation/test_isaac.py diff --git a/tests/models/multimodal/generation/test_isaac.py b/tests/models/multimodal/generation/test_isaac.py new file mode 100644 index 0000000000000..cf3081696a630 --- /dev/null +++ b/tests/models/multimodal/generation/test_isaac.py @@ -0,0 +1,269 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Separated from test_common.py because HF loading for PerceptronAI/Isaac-0.1 +requires perceptron package (Run 'pip install perceptron'). +""" + +import pytest + +pytest.importorskip("perceptron", reason="Requires 'pip install perceptron'") + +import types +from pathlib import PosixPath + +import torch +from perceptron.tensorstream import TextType +from perceptron.tensorstream.ops import compute_mrope_pos_tensor, modality_mask +from transformers.modeling_outputs import BaseModelOutputWithPast + +from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner +from .vlm_utils import runners +from .vlm_utils.case_filtering import get_parametrized_options +from .vlm_utils.types import ExpandableVLMTestArgs, VLMTestInfo, VLMTestType + + +def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor: + r"""Create 3D positional indices for token input. + Args: + input_ids (`torch.Tensor`): + Tensor of shape `(batch_size, seq_len)` containing token ids. 
+ Returns: + `torch.Tensor`: Positional indices with shape `(batch_size, seq_len, 3)` + where each channel duplicates the 1D position so it can be consumed by + the 3-axis MRoPE rotary embedding. + """ + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, device=input_ids.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3) # Add 3D for MRoPE + return position_ids + + +def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patch HF runner for Isaac: + 1) move processor outputs to model device + 2) ensure IsaacModel.forward returns hidden_states + for compatibility with hidden_states_to_seq_logprobs() + """ + + model_device = next(hf_model.model.parameters()).device + + # ---------------------------- + # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device + # ---------------------------- + original_processor = hf_model.processor + + def patched_processor(*args, **kwargs): + result = original_processor(*args, **kwargs) + for k, v in result.data.items(): + result[k] = v.to(model_device) + return result + + hf_model.processor = patched_processor + + # ---------------------------- + # 2) Patch IsaacModel.forward: add hidden_states to the output + # ---------------------------- + isaac_model = hf_model.model.model # IsaacModel + + def patched_forward( + self, + input_ids=None, + tensor_stream=None, + attention_mask=None, + position_ids=None, + modality_tensor=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_hidden_states=None, + return_dict=None, + cache_position=None, + **kwargs, + ): + """ + Forward pass with MRoPE position embeddings. + Computes position embeddings once and passes them through all layers. 
+ """ + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # Get inputs + if tensor_stream is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both tensor_stream and inputs_embeds") + elif tensor_stream is not None: + # Embed TensorStream directly + inputs_embeds = self.embed_stream(tensor_stream) + # Create modality tensor if not provided + if modality_tensor is None: + modality_tensor = modality_mask(tensor_stream) + elif input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + inputs_embeds = self.embed_tokens(input_ids) + # Create text modality tensor if not provided + if modality_tensor is None: + batch_size, seq_length = input_ids.shape + modality_tensor = torch.full( + (batch_size, seq_length), + TextType.text.value, + device=input_ids.device, + dtype=torch.long, + ) + elif inputs_embeds is None: + raise ValueError( + "You have to specify either tensor_stream, input_ids or inputs_embeds" + ) + + # Create default position_ids if not provided + if position_ids is None: + if tensor_stream is not None: + position_ids = compute_mrope_pos_tensor(tensor_stream) # (B,L,3) + else: + position_ids = compute_position_ids_input_ids(input_ids) + + # Compute MRoPE position embeddings if we have custom rotary_emb + cos, sin = self.rotary_emb(position_ids, modality_tensor) + cos = cos.to(inputs_embeds.dtype) + sin = sin.to(inputs_embeds.dtype) + + # Prepare attention mask + if attention_mask is not None: + attention_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, False + ) + + # Initialize and collect hidden states + 
all_hidden_states = () + hidden_states = inputs_embeds + all_hidden_states += (hidden_states,) + + for decoder_layer in self.layers: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=(cos, sin), + **kwargs, + ) + + hidden_states = ( + layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs + ) + all_hidden_states += (hidden_states,) + + # Final layer norm + hidden_states = self.norm(hidden_states) + all_hidden_states += (hidden_states,) + + # Include hiden_states for compatibility with hidden_states_to_seq_logprobs() + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + ) + + isaac_model.forward = types.MethodType(patched_forward, isaac_model) + + return hf_model + + +ISAAC_TEST_SETTINGS = { + "isaac": VLMTestInfo( + models=["PerceptronAI/Isaac-0.1"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: ( + f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n" + ), + img_idx_to_prompt=lambda idx: "", + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "Please describe the image shortly.", + "cherry_blossom": "Please infer the season with reason.", + } + ), + multi_image_prompt=( + "Picture 1: \n" + "Picture 2: \n" + "Describe these two images with one paragraph respectively." 
+ ), + enforce_eager=False, + max_model_len=4096, + max_num_seqs=2, + hf_model_kwargs={"device_map": "auto"}, + patch_hf_runner=isaac_patch_hf_runner, + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + ) +} + + +### Test wrappers +# Wrappers around the test running func for: +# - single image +# - multi-image +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + ISAAC_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + create_new_process_for_each_test=False, + ), +) +def test_isaac_single_image( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): + model_test_info = ISAAC_TEST_SETTINGS[model_type] + runners.run_single_image_test( + tmp_path=tmp_path, + model_test_info=model_test_info, + test_case=test_case, + hf_runner=hf_runner, + vllm_runner=vllm_runner, + image_assets=image_assets, + ) + + +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + ISAAC_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + create_new_process_for_each_test=False, + ), +) +def test_isaac_multi_image( + tmp_path: PosixPath, + model_type: str, + test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + image_assets: ImageTestAssets, +): + model_test_info = ISAAC_TEST_SETTINGS[model_type] + runners.run_multi_image_test( + tmp_path=tmp_path, + model_test_info=model_test_info, + test_case=test_case, + hf_runner=hf_runner, + vllm_runner=vllm_runner, + image_assets=image_assets, + ) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index f435b24335a62..d3bdb1370347c 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -642,7 +642,7 @@ class IsaacImageProcessor: return_tensors: str | TensorType | None, **kwargs: Unpack[IsaacImageProcessorKwargs], ) -> BatchFeature: - """Isaac's 
resize → normalize → patchify → pack.""" + """Preprocess images into format compatible with vLLM input processing.""" all_pixel_values: list[torch.Tensor] = [] all_image_grids: list[torch.Tensor] = [] @@ -668,7 +668,7 @@ class IsaacImageProcessor: # Use real patch dimensions for image_grid_thw, not virtual dimensions # This ensures the vision model receives correct grid info for pixel shuffle dims_real = [1, hp, wp] # Real patch dimensions - image_grid_thw = torch.tensor(dims_real).unsqueeze(0) # [1, [T, H, W]] + image_grid_thw = torch.tensor(dims_real).unsqueeze(0) all_pixel_values.append(pixel_values) all_image_grids.append(image_grid_thw) @@ -700,11 +700,30 @@ class IsaacProcessor: def __call__(self, text=None, images=None, **kwargs) -> BatchFeature: result = {} + if images is not None: + image_inputs = self.image_processor.preprocess(images, **kwargs) + image_grid_thw = image_inputs["image_grid_thw"] + result.update(image_inputs) + + if text is not None: + if not isinstance(text, list): + text = [text] + + text = text.copy() # below lines change text in-place + merge_length = self.image_processor.pixel_shuffle_scale**2 + index = 0 + for i in range(len(text)): + while self.image_token in text[i]: + num_image_tokens = image_grid_thw[index].prod() // merge_length + text[i] = text[i].replace( + self.image_token, "<|placeholder|>" * num_image_tokens, 1 + ) + index += 1 + text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>") + if text is not None: result.update(self.tokenizer(text, **kwargs)) - if images is not None: - image_result = self.image_processor.preprocess(images, **kwargs) - result.update(image_result) + return BatchFeature(result) def apply_chat_template( From 4789b9911024b894d99e34a31648c5453c252cbb Mon Sep 17 00:00:00 2001 From: Oscar Gonzalez Date: Mon, 22 Dec 2025 18:24:58 -0500 Subject: [PATCH 17/18] fix: resolve mypy type errors in isaac_patch_hf_runner Signed-off-by: Oscar Gonzalez --- .../models/multimodal/generation/test_isaac.py | 17 
+++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/models/multimodal/generation/test_isaac.py b/tests/models/multimodal/generation/test_isaac.py index cf3081696a630..7cc2cfb311572 100644 --- a/tests/models/multimodal/generation/test_isaac.py +++ b/tests/models/multimodal/generation/test_isaac.py @@ -145,9 +145,11 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ) # Initialize and collect hidden states - all_hidden_states = () hidden_states = inputs_embeds - all_hidden_states += (hidden_states,) + hidden_states_list: list[torch.Tensor] = [] + + if output_hidden_states: + hidden_states_list.append(hidden_states) for decoder_layer in self.layers: layer_outputs = decoder_layer( @@ -164,11 +166,18 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner: hidden_states = ( layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs ) - all_hidden_states += (hidden_states,) + + if output_hidden_states: + hidden_states_list.append(hidden_states) # Final layer norm hidden_states = self.norm(hidden_states) - all_hidden_states += (hidden_states,) + + if output_hidden_states: + hidden_states_list.append(hidden_states) + + # Convert to tuple or None + all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None # Include hiden_states for compatibility with hidden_states_to_seq_logprobs() return BaseModelOutputWithPast( From bab4dea597e3ecefd665ca9b530225407adfbcb8 Mon Sep 17 00:00:00 2001 From: Oscar Gonzalez Date: Tue, 23 Dec 2025 15:43:21 -0500 Subject: [PATCH 18/18] Add perceptron dependency for Isaac tests and refactor tests. 
Signed-off-by: Oscar Gonzalez --- requirements/test.in | 2 + requirements/test.txt | 21 ++- .../multimodal/generation/test_common.py | 25 +++ .../generation/vlm_utils/model_utils.py | 177 ++++++++++++++++++ 4 files changed, 223 insertions(+), 2 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 55452ce83f232..68e607ff7e308 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -57,3 +57,5 @@ pydantic>=2.12 # 2.11 leads to error on python 3.13 decord==0.6.0 terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test gpt-oss >= 0.0.7; python_version > '3.11' + +perceptron # required for isaac test diff --git a/requirements/test.txt b/requirements/test.txt index ea2093e4347fe..843a8212b819f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -135,6 +135,7 @@ cloudpickle==3.1.1 # via mlflow-skinny colorama==0.4.6 # via + # perceptron # sacrebleu # schemathesis # tqdm-multiprocess @@ -302,6 +303,8 @@ h11==0.14.0 # via # httpcore # uvicorn +h2==4.3.0 + # via httpx h5py==3.13.0 # via terratorch harfile==0.3.0 @@ -310,6 +313,8 @@ hf-xet==1.1.7 # via huggingface-hub hiredis==3.0.0 # via tensorizer +hpack==4.1.0 + # via h2 html2text==2025.4.15 # via gpt-oss httpcore==1.0.6 @@ -317,6 +322,7 @@ httpcore==1.0.6 httpx==0.27.2 # via # -r requirements/test.in + # perceptron # schemathesis huggingface-hub==0.34.3 # via @@ -338,6 +344,8 @@ hydra-core==1.3.2 # via # lightly # lightning +hyperframe==6.1.0 + # via h2 hypothesis==6.131.0 # via # hypothesis-graphql @@ -549,6 +557,7 @@ numpy==1.26.4 # pandas # patsy # peft + # perceptron # pycocotools # pyogrio # rasterio @@ -702,6 +711,8 @@ peft==0.16.0 # via # -r requirements/test.in # lm-eval +perceptron==0.1.4 + # via -r requirements/test.in pillow==10.4.0 # via # genai-perf @@ -709,6 +720,7 @@ pillow==10.4.0 # lightly-utils # matplotlib # mistral-common + # perceptron # scikit-image # segmentation-models-pytorch # sentence-transformers @@ -952,6 +964,7 
@@ rich==13.9.4 # genai-perf # lightning # mteb + # perceptron # typer rioxarray==0.19.0 # via terratorch @@ -1024,7 +1037,9 @@ shapely==2.1.1 # geopandas # torchgeo shellingham==1.5.4 - # via typer + # via + # perceptron + # typer six==1.16.0 # via # junit-xml @@ -1218,7 +1233,9 @@ typepy==1.3.2 # pytablewriter # tabledata typer==0.15.2 - # via fastsafetensors + # via + # fastsafetensors + # perceptron types-python-dateutil==2.9.0.20241206 # via arrow typeshed-client==2.8.2 diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index c5a0b6748f797..17f45e79a7387 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -510,6 +510,31 @@ VLM_TEST_SETTINGS = { use_tokenizer_eos=True, auto_cls=AutoModelForImageTextToText, ), + "isaac": VLMTestInfo( + models=["PerceptronAI/Isaac-0.1"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: ( + f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n" + ), + img_idx_to_prompt=lambda idx: "", + single_image_prompts=IMAGE_ASSETS.prompts( + { + "stop_sign": "Please describe the image shortly.", + "cherry_blossom": "Please infer the season with reason.", + } + ), + multi_image_prompt=( + "Picture 1: \n" + "Picture 2: \n" + "Describe these two images with one paragraph respectively." 
+ ), + enforce_eager=False, + max_model_len=4096, + max_num_seqs=2, + hf_model_kwargs={"device_map": "auto"}, + patch_hf_runner=model_utils.isaac_patch_hf_runner, + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + ), "kimi_vl": VLMTestInfo( models=["moonshotai/Kimi-VL-A3B-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index b2c62fbd119cc..acc18021859b5 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -522,6 +522,183 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: return hf_model +def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patch HF runner for Isaac: + 1) Move processor outputs to model device + 2) Ensure IsaacModel.forward returns hidden_states + for compatibility with hidden_states_to_seq_logprobs() + """ + + from perceptron.tensorstream import TextType + from perceptron.tensorstream.ops import compute_mrope_pos_tensor, modality_mask + from transformers.modeling_outputs import BaseModelOutputWithPast + + def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor: + """ + Create 3D positional indices for token input. 
+ """ + batch_size, seq_length = input_ids.shape + position_ids = torch.arange(seq_length, device=input_ids.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3) # Add 3D for MRoPE + return position_ids + + model_device = next(hf_model.model.parameters()).device + + # ---------------------------- + # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device + # ---------------------------- + original_processor = hf_model.processor + + def patched_processor(*args, **kwargs): + result = original_processor(*args, **kwargs) + for k, v in result.data.items(): + result[k] = v.to(model_device) + return result + + hf_model.processor = patched_processor + + tokenizer = AutoTokenizer.from_pretrained( + hf_model.model_name, trust_remote_code=True + ) + + original_generate = hf_model.model.generate + + def patched_generate(*args, **kwargs): + kwargs["pad_token_id"] = tokenizer.eos_token_id + kwargs["eos_token_id"] = tokenizer.eos_token_id + return original_generate(*args, **kwargs) + + hf_model.model.generate = patched_generate + + # ---------------------------- + # 2) Patch IsaacModel.forward: add hidden_states to the output + # ---------------------------- + isaac_model = hf_model.model.model + + def patched_forward( + self, + input_ids=None, + tensor_stream=None, + attention_mask=None, + position_ids=None, + modality_tensor=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_hidden_states=None, + return_dict=None, + cache_position=None, + **kwargs, + ): + """ + Forward pass with MRoPE position embeddings. + Computes position embeddings once and passes them through all layers. 
+ """ + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # Get inputs + if tensor_stream is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both tensor_stream and inputs_embeds") + elif tensor_stream is not None: + # Embed TensorStream directly + inputs_embeds = self.embed_stream(tensor_stream) + # Create modality tensor if not provided + if modality_tensor is None: + modality_tensor = modality_mask(tensor_stream) + elif input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + inputs_embeds = self.embed_tokens(input_ids) + # Create text modality tensor if not provided + if modality_tensor is None: + batch_size, seq_length = input_ids.shape + modality_tensor = torch.full( + (batch_size, seq_length), + TextType.text.value, + device=input_ids.device, + dtype=torch.long, + ) + elif inputs_embeds is None: + raise ValueError( + "You have to specify either tensor_stream, input_ids or inputs_embeds" + ) + + # Create default position_ids if not provided + if position_ids is None: + if tensor_stream is not None: + position_ids = compute_mrope_pos_tensor(tensor_stream) # (B,L,3) + else: + position_ids = compute_position_ids_input_ids(input_ids) + + # Compute MRoPE position embeddings if we have custom rotary_emb + cos, sin = self.rotary_emb(position_ids, modality_tensor) + cos = cos.to(inputs_embeds.dtype) + sin = sin.to(inputs_embeds.dtype) + + # Prepare attention mask + if attention_mask is not None: + attention_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, False + ) + + # Initialize and collect hidden states + 
hidden_states = inputs_embeds + hidden_states_list: list[torch.Tensor] = [] + + if output_hidden_states: + hidden_states_list.append(hidden_states) + + for decoder_layer in self.layers: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=(cos, sin), + **kwargs, + ) + + hidden_states = ( + layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs + ) + + if output_hidden_states: + hidden_states_list.append(hidden_states) + + # Final layer norm + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + hidden_states_list.append(hidden_states) + + # Convert to tuple or None + all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None + + # Include hidden_states for compatibility with hidden_states_to_seq_logprobs() + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + ) + + isaac_model.forward = types.MethodType(patched_forward, isaac_model) + + return hf_model + + def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for SkyworkR1V."""