1. Add support for the Isaac model in the registry and documentation.

2. Optimize the Isaac model implementation.

Signed-off-by: Yang <lymailforjob@gmail.com>
This commit is contained in:
Yang 2025-11-26 15:09:47 -08:00 committed by Yang Liu
parent ac8a0b936a
commit c10f5653ba
3 changed files with 212 additions and 187 deletions

View File

@ -679,6 +679,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ | | `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
| `IsaacForConditionalGeneration` | Isaac | T + I<sup>+</sup> | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ |

View File

@ -646,6 +646,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"HuggingFaceM4/Idefics3-8B-Llama3", "HuggingFaceM4/Idefics3-8B-Llama3",
extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"},
), ),
"IsaacForConditionalGeneration": _HfExamplesInfo(
"PerceptronAI/Isaac-0.1",
trust_remote_code=True,
),
"InternS1ForConditionalGeneration": _HfExamplesInfo( "InternS1ForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1", trust_remote_code=True "internlm/Intern-S1", trust_remote_code=True
), ),

View File

@ -4,7 +4,7 @@ from __future__ import annotations
import itertools import itertools
import math import math
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Iterator, Mapping, Sequence
from dataclasses import dataclass from dataclasses import dataclass
from enum import Enum from enum import Enum
from typing import Any from typing import Any
@ -15,7 +15,7 @@ import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from einops import rearrange from einops import rearrange
from transformers import PretrainedConfig, Qwen3Config from transformers import Qwen3Config
from transformers.image_processing_utils import BatchFeature from transformers.image_processing_utils import BatchFeature
from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig
from transformers.tokenization_utils import TensorType from transformers.tokenization_utils import TensorType
@ -30,8 +30,10 @@ from vllm.attention.ops.vit_attn_wrappers import (
vit_xformers_attn_wrapper, vit_xformers_attn_wrapper,
) )
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.config.model import ModelConfig
from vllm.distributed import parallel_state from vllm.distributed import parallel_state
from vllm.distributed import utils as dist_utils from vllm.distributed import utils as dist_utils
from vllm.logger import init_logger
from vllm.model_executor.layers.linear import ( from vllm.model_executor.layers.linear import (
ColumnParallelLinear, ColumnParallelLinear,
QKVParallelLinear, QKVParallelLinear,
@ -50,18 +52,18 @@ from vllm.model_executor.models.interfaces import (
SupportsPP, SupportsPP,
) )
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM
from vllm.model_executor.models.siglip import SiglipMLP from vllm.model_executor.models.siglip import SiglipMLP
from vllm.model_executor.models.utils import ( from vllm.model_executor.models.utils import (
AutoWeightsLoader, AutoWeightsLoader,
WeightsMapper, WeightsMapper,
_merge_multimodal_embeddings, init_vllm_registered_model,
maybe_prefix, maybe_prefix,
) )
from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.model_executor.models.vision import get_vit_attn_backend
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import ( from vllm.multimodal.inputs import (
MultiModalDataDict, MultiModalDataDict,
MultiModalFeatureSpec,
MultiModalFieldConfig, MultiModalFieldConfig,
MultiModalKwargs, MultiModalKwargs,
) )
@ -73,6 +75,13 @@ from vllm.multimodal.processing import (
PromptUpdate, PromptUpdate,
) )
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.tokenizer import (
get_cached_tokenizer,
get_tokenizer,
)
logger = init_logger(__name__)
# ===== TensorStream Compatibility Layer for Isaac MRoPE ===== # ===== TensorStream Compatibility Layer for Isaac MRoPE =====
# Minimal implementation of TensorStream classes needed for Isaac's 3D positional # Minimal implementation of TensorStream classes needed for Isaac's 3D positional
@ -286,12 +295,14 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten
dims = (event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or [])) dims = (event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or []))
# Create ranges for each dimension (similar to old _finalize implementation) # Create ranges for each dimension (similar to old _finalize implementation)
first_dim = range(cumulative_offset, cumulative_offset + dims[0]) first_dim = list(range(cumulative_offset, cumulative_offset + dims[0]))
cumulative_offset += dims[0] # advance time for the next event cumulative_offset += dims[0] # advance time for the next event
other_dims = [range(d) for d in dims[1:]]
# Use itertools.product to create all coordinate combinations if event.modality_type != VisionType.image:
full_coords = list(itertools.product(first_dim, *other_dims)) full_coords = [(t, t, t) for t in first_dim]
else:
other_dims = [range(d) for d in dims[1:]]
full_coords = list(itertools.product(first_dim, *other_dims))
# Slice if the event is partial # Slice if the event is partial
s, e = event.idx_range s, e = event.idx_range
@ -307,6 +318,19 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten
) )
def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int:
    """Return the tokenizer id used for the ``vision_token`` placeholder.

    The tokenizer is loaded from ``model_config.tokenizer``, falling back to
    ``model_config.model`` when that is empty, wrapped with
    ``get_cached_tokenizer``, and then used to encode ``vision_token`` without
    special tokens.

    Args:
        model_config: Supplies the tokenizer name, mode, revision and the
            ``trust_remote_code`` flag.
        vision_token: Placeholder string to look up.

    Returns:
        The first id of the encoded placeholder.
    """
    name_or_path = model_config.tokenizer or model_config.model
    # Prefer a tokenizer-specific revision; otherwise reuse the model revision.
    revision = model_config.tokenizer_revision or model_config.revision
    raw_tokenizer = get_tokenizer(
        name_or_path,
        tokenizer_mode=model_config.tokenizer_mode,
        trust_remote_code=model_config.trust_remote_code,
        revision=revision,
    )
    token_ids = get_cached_tokenizer(raw_tokenizer).encode(
        vision_token, add_special_tokens=False
    )
    # NOTE(review): only the first id is kept. A placeholder that encodes to
    # several ids is truncated silently and an empty encoding raises
    # IndexError -- confirm the placeholder is always a single token.
    return token_ids[0]
def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor: def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor:
"""Create boolean mask for specific modality type in the tensor stream.""" """Create boolean mask for specific modality type in the tensor stream."""
B, T = ts.shape B, T = ts.shape
@ -883,7 +907,8 @@ class IsaacConfig(Qwen3Config):
vision_min_num_patches: int | None = None, vision_min_num_patches: int | None = None,
pixel_shuffle_scale: int = 1, pixel_shuffle_scale: int = 1,
max_sequence_length: int = 16384, max_sequence_length: int = 16384,
vision_token: str = "<|image_pad|>", vision_token: str = "<image>",
vision_attn_implementation: str | None = None,
**kwargs, **kwargs,
): ):
super().__init__(**kwargs) super().__init__(**kwargs)
@ -899,10 +924,25 @@ class IsaacConfig(Qwen3Config):
self.vision_token = vision_token self.vision_token = vision_token
# Handle vision config - PixelShuffleSiglip2VisionConfig instance # Handle vision config - PixelShuffleSiglip2VisionConfig instance
self.vision_config = PixelShuffleSiglip2VisionConfig( if isinstance(vision_config, dict):
pixel_shuffle_scale_factor=pixel_shuffle_scale, self.vision_config = PixelShuffleSiglip2VisionConfig(**vision_config)
num_patches=vision_max_num_patches, elif vision_config is None:
self.vision_config = PixelShuffleSiglip2VisionConfig()
else:
self.vision_config = vision_config
# Ensure compatibility with pretrained checkpoints
self.vision_config.pixel_shuffle_scale_factor = getattr(
self.vision_config,
"pixel_shuffle_scale_factor",
pixel_shuffle_scale,
) )
self.vision_config.num_patches = getattr(
self.vision_config,
"num_patches",
vision_max_num_patches,
)
self.vision_attn_implementation = vision_attn_implementation
class IsaacImageProcessorKwargs(TypedDict, total=False): class IsaacImageProcessorKwargs(TypedDict, total=False):
@ -991,9 +1031,9 @@ class IsaacProcessor:
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
def __init__(self, image_processor=None, tokenizer=None, **kwargs): def __init__(self, image_processor=None, tokenizer=None, **kwargs):
self.image_token = kwargs.pop("image_token", "<image>")
self.image_processor = image_processor or IsaacImageProcessor(kwargs) self.image_processor = image_processor or IsaacImageProcessor(kwargs)
self.tokenizer = tokenizer self.tokenizer = tokenizer
self.image_token = "<|image_pad|>"
def __call__(self, text=None, images=None, **kwargs) -> BatchFeature: def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
result = {} result = {}
@ -1062,12 +1102,20 @@ class IsaacProcessingInfo(BaseProcessingInfo):
max_sequence_length=getattr( max_sequence_length=getattr(
original_config, "max_sequence_length", 16384 original_config, "max_sequence_length", 16384
), ),
vision_token="<|image_pad|>", vision_token=getattr(original_config, "vision_token", "<image>"),
vision_attn_implementation=getattr(
original_config, "vision_attn_implementation", None
),
) )
return IsaacConfig() return IsaacConfig()
def get_hf_processor(self, **kwargs) -> IsaacProcessor: def get_hf_processor(self, **kwargs) -> IsaacProcessor:
return self.ctx.get_hf_processor(IsaacProcessor, **kwargs) hf_config = self.get_hf_config()
processor_kwargs = {
"image_token": hf_config.vision_token,
}
processor_kwargs.update(kwargs)
return self.ctx.get_hf_processor(IsaacProcessor, **processor_kwargs)
def get_tokenizer(self): def get_tokenizer(self):
return self.ctx.tokenizer return self.ctx.tokenizer
@ -1157,11 +1205,13 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor):
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
# hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) # hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_config = self.info.get_hf_config()
image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
placeholder_id = tokenizer.encode(
vocab = tokenizer.get_vocab() hf_config.vision_token,
placeholder_id = vocab.get("<|image_pad|>", 151655) add_special_tokens=False,
)
pixel_shuffle_scale = getattr(image_processor, "pixel_shuffle_scale", 2) pixel_shuffle_scale = getattr(image_processor, "pixel_shuffle_scale", 2)
merge_length = pixel_shuffle_scale**2 merge_length = pixel_shuffle_scale**2
@ -1172,12 +1222,12 @@ class IsaacMultiModalProcessor(BaseMultiModalProcessor):
assert isinstance(grid_thw, torch.Tensor) assert isinstance(grid_thw, torch.Tensor)
num_tokens = int(grid_thw.prod()) // merge_length num_tokens = int(grid_thw.prod()) // merge_length
return [placeholder_id] * num_tokens return placeholder_id * num_tokens
return [ return [
PromptReplacement( PromptReplacement(
modality="image", modality="image",
target=[placeholder_id], target=placeholder_id,
replacement=get_replacement_isaac, replacement=get_replacement_isaac,
) )
] ]
@ -1278,16 +1328,7 @@ class Siglip2VisionAttention(nn.Module):
def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]:
seq_len, bs, _ = qkv.shape seq_len, bs, _ = qkv.shape
if self.tp_size > 1:
qkv = all_gather_interleave(qkv, self.qkv_proj.hidden_size, self.tp_size)
q, k, v = qkv.chunk(3, dim=2) q, k, v = qkv.chunk(3, dim=2)
if self.tp_size > 1:
q = dist_utils.split_tensor_along_last_dim(q, self.tp_size)[self.tp_rank]
k = dist_utils.split_tensor_along_last_dim(k, self.tp_size)[self.tp_rank]
v = dist_utils.split_tensor_along_last_dim(v, self.tp_size)[self.tp_rank]
new_shape = ( new_shape = (
seq_len, seq_len,
bs, bs,
@ -1604,7 +1645,8 @@ class IsaacVisionEmbedding(nn.Module):
vision_cfg: PixelShuffleSiglip2VisionConfig, vision_cfg: PixelShuffleSiglip2VisionConfig,
hidden_dim: int, hidden_dim: int,
output_dim: int, output_dim: int,
prefix: str, quant_config: QuantizationConfig | None = None,
prefix: str = "",
): ):
super().__init__() super().__init__()
self.transformer = Siglip2VisionTransformer( self.transformer = Siglip2VisionTransformer(
@ -1614,6 +1656,7 @@ class IsaacVisionEmbedding(nn.Module):
hidden_dim, hidden_dim,
4 * hidden_dim, 4 * hidden_dim,
bias=False, bias=False,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "vision_embedding.1"), prefix=maybe_prefix(prefix, "vision_embedding.1"),
return_bias=False, return_bias=False,
) )
@ -1622,6 +1665,7 @@ class IsaacVisionEmbedding(nn.Module):
4 * hidden_dim, 4 * hidden_dim,
output_dim, output_dim,
bias=False, bias=False,
quant_config=quant_config,
prefix=maybe_prefix(prefix, "vision_embedding.3"), prefix=maybe_prefix(prefix, "vision_embedding.3"),
return_bias=False, return_bias=False,
) )
@ -1642,8 +1686,9 @@ class IsaacVisionEmbedding(nn.Module):
dummy_inputs=IsaacDummyInputsBuilder, dummy_inputs=IsaacDummyInputsBuilder,
) )
class IsaacForConditionalGeneration( class IsaacForConditionalGeneration(
Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
): ):
merge_by_field_config = True
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": [ "qkv_proj": [
"q_proj", "q_proj",
@ -1661,221 +1706,196 @@ class IsaacForConditionalGeneration(
# To ensure correct weight loading and mapping. # To ensure correct weight loading and mapping.
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_prefix={ orig_to_new_prefix={
"lm_head.": "language_model.lm_head.",
"model.vision_embedding.0": "vision_embedding.transformer",
"model.vision_embedding.1": "vision_embedding.linear_fc1",
"model.vision_embedding.2": "vision_embedding.act",
"model.vision_embedding.3": "vision_embedding.linear_fc2",
"model.vision_embedding.": "vision_embedding.", "model.vision_embedding.": "vision_embedding.",
"vision_embedding.0": "vision_embedding.transformer", "model.": "language_model.model.",
"vision_embedding.1": "vision_embedding.linear_fc1",
"vision_embedding.2": "vision_embedding.act",
"vision_embedding.3": "vision_embedding.linear_fc2",
} }
) )
@classmethod @classmethod
def get_placeholder_str(cls, modality: str, i: int) -> str | None: def get_placeholder_str(cls, modality: str, i: int) -> str | None:
if modality.startswith("image"): if modality.startswith("image"):
return "<|image_pad|>" return "<image>"
raise ValueError("Only image modality is supported") raise ValueError("Only image modality is supported")
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"):
super().__init__()
config: IsaacConfig = vllm_config.model_config.hf_config config: IsaacConfig = vllm_config.model_config.hf_config
head_dim = config.head_dim quant_config = vllm_config.quant_config
self.config = config
self.multimodal_config = vllm_config.model_config.multimodal_config
head_dim = config.head_dim
calculated_mrope_section = [ calculated_mrope_section = [
head_dim // 4, # 2x more for temporal dim head_dim // 4, # 2x more for temporal dim
head_dim // 8, head_dim // 8,
head_dim // 8, head_dim // 8,
] ]
self.vision_token_id = _resolve_vision_token_id(
vllm_config.model_config, config.vision_token
)
config.image_token_id = self.vision_token_id
logger.info("vllm config: %s", repr(vllm_config))
config.rope_scaling["mrope_section"] = calculated_mrope_section config.rope_scaling["mrope_section"] = calculated_mrope_section
self.config = config self.language_model = init_vllm_registered_model(
vllm_config=vllm_config,
# Initialize the parent class with updated config architectures=["Qwen3ForCausalLM"],
super().__init__(vllm_config=vllm_config, prefix=prefix) prefix=maybe_prefix(prefix, "language_model"),
)
# Create the language model module to match checkpoint structure self.make_empty_intermediate_tensors = (
self.language_model = nn.ModuleDict( self.language_model.make_empty_intermediate_tensors
{
"embed_tokens": self.model.embed_tokens,
"layers": self.model.layers,
"norm": self.model.norm,
}
) )
config.vision_config.preserve_original_pe = True
config.vision_config.use_rope = False
config.vision_config.hidden_stride = (
config.vision_config.pixel_shuffle_scale_factor
)
config.vision_config.window_size = 32 * 2
config.vision_config.fullatt_block_indexes = None
vision_cfg = config.vision_config vision_cfg = config.vision_config
if vision_cfg is None: if vision_cfg is None:
raise ValueError("IsaacConfig should always have vision_config") raise ValueError("IsaacConfig should always have vision_config")
vision_cfg.preserve_original_pe = True
vision_cfg.use_rope = False
vision_cfg.hidden_stride = vision_cfg.pixel_shuffle_scale_factor
vision_cfg.window_size = 32 * 2
vision_cfg.fullatt_block_indexes = None
attn_impl = (
config.vision_attn_implementation
if config.vision_attn_implementation is not None
else getattr(config, "_attn_implementation", None)
)
if attn_impl is not None:
vision_cfg._attn_implementation = attn_impl
hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2)
self.vision_embedding = IsaacVisionEmbedding( self.vision_embedding = IsaacVisionEmbedding(
vision_cfg=vision_cfg, vision_cfg=vision_cfg,
hidden_dim=hidden_dim, hidden_dim=hidden_dim,
output_dim=config.hidden_size, output_dim=config.hidden_size,
prefix=prefix, quant_config=quant_config,
prefix=maybe_prefix(prefix, "vision_embedding"),
) )
def iter_mm_grid_hw(
    self, input_tokens: list[int], mm_features: list[MultiModalFeatureSpec]
) -> Iterator[tuple[int, int, int]]:
    """Yield ``(offset, grid_h, grid_w)`` for each image feature.

    Features are visited in ascending ``mm_position.offset`` order. The
    height and width read from ``image_grid_thw`` are floor-divided by the
    vision config's ``pixel_shuffle_scale_factor``.

    Args:
        input_tokens: Prompt token ids. Not read here; kept so the signature
            stays unchanged for existing callers.
        mm_features: Multimodal feature specs for the prompt.

    Yields:
        ``(offset, grid_h, grid_w)`` tuples, one per image feature.

    Raises:
        ValueError: If a feature is not an image, or if an image reports a
            frame count other than 1.
    """
    merge_size = self.config.vision_config.pixel_shuffle_scale_factor
    for feature in sorted(mm_features, key=lambda f: f.mm_position.offset):
        if feature.modality != "image":
            raise ValueError(f"Unsupported modality: {feature.modality}")
        t, h, w = feature.data["image_grid_thw"].data.tolist()
        # Raised explicitly instead of asserted: `assert` is removed under
        # `python -O`, which would let multi-frame grids through unchecked.
        if t != 1:
            raise ValueError(f"Image must have 1 frame, got {t}")
        yield feature.mm_position.offset, h // merge_size, w // merge_size
def get_mrope_input_positions( def get_mrope_input_positions(
self, self,
input_tokens: list[int], input_tokens: list[int],
hf_config: PretrainedConfig, mm_features: list[MultiModalFeatureSpec],
image_grid_thw: list[list[int]] | torch.Tensor,
video_grid_thw: list[list[int]] | torch.Tensor,
context_len: int = 0,
seq_len: int | None = None,
second_per_grid_ts: list[float] | None = None,
audio_feature_lengths: torch.Tensor | None = None,
use_audio_in_video: bool = False,
) -> tuple[torch.Tensor, int]: ) -> tuple[torch.Tensor, int]:
"""Get mrope input positions and delta value.""" llm_pos_ids_list = []
st = 0
vision_token_id = getattr(self.config, "image_token_id", 151655) for offset, llm_grid_h, llm_grid_w in self.iter_mm_grid_hw(
spatial_merge_size = hf_config.vision_config.pixel_shuffle_scale_factor input_tokens, mm_features
input_tokens_tensor = torch.tensor(input_tokens) ):
text_len = offset - st
# Find image token positions st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
image_positions = torch.where(input_tokens_tensor == vision_token_id)[ llm_pos_ids_list.append(
0 np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx
].tolist()
# For text-only inputs, use Isaac's original logic from
# compute_position_ids_input_ids()
if len(image_positions) == 0:
seq_len = len(input_tokens)
# Create 3D positions where all dimensions get the same 1D temporal
# progression
position_ids = torch.arange(seq_len, dtype=torch.long)
position_ids = position_ids.view(1, -1).expand(1, -1) # [1, seq_len]
position_ids = position_ids.unsqueeze(2).expand(
-1, -1, 3
) # [1, seq_len, 3]
# vLLM expects shape [3, seq_len], so transpose
position_ids = position_ids.squeeze(0).transpose(0, 1) # [3, seq_len]
return position_ids, 0
events = []
image_idx = 0
current_pos = 0
last_processed_pos = -1
for image_pos in image_positions:
if image_pos <= last_processed_pos:
continue # Skip already processed positions
# Add any text before this image
if image_pos > current_pos:
text_tokens = image_pos - current_pos
text_event = Event(
modality_type=TextType.text,
dims_virtual=[text_tokens, 1],
idx_range=(0, text_tokens),
)
events.append(text_event)
# Add image
t, h, w = image_grid_thw[image_idx]
llm_grid_h, llm_grid_w = h // spatial_merge_size, w // spatial_merge_size
image_tokens = t * llm_grid_h * llm_grid_w
image_event = Event(
modality_type=VisionType.image,
dims_virtual=[t, llm_grid_h, llm_grid_w],
idx_range=(0, image_tokens),
) )
events.append(image_event)
current_pos = image_pos + image_tokens grid_indices = np.indices((1, llm_grid_h, llm_grid_w)).reshape(3, -1)
last_processed_pos = ( grid_indices[0, :] = grid_indices[0, :] + text_len + st_idx
current_pos - 1 llm_pos_ids_list.append(grid_indices)
) # Mark up to this position as processed st = offset + llm_grid_h * llm_grid_w
image_idx += 1
# Add final text segment if any if st < len(input_tokens):
if current_pos < len(input_tokens): st_idx = llm_pos_ids_list[-1][0, -1] + 1 if len(llm_pos_ids_list) > 0 else 0
text_tokens = len(input_tokens) - current_pos text_len = len(input_tokens) - st
text_event = Event( llm_pos_ids_list.append(
modality_type=TextType.text, np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx
dims_virtual=[text_tokens, 1],
idx_range=(0, text_tokens),
) )
events.append(text_event)
stream = Stream(events) llm_positions = np.concatenate(llm_pos_ids_list, axis=1).reshape(3, -1)
tensor_stream = TensorStream([stream]) mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
# Use Isaac's native MRoPE calculation return torch.from_numpy(llm_positions), mrope_position_delta
position_ids = compute_mrope_pos_tensor(tensor_stream, n_pos_dims=3)
# Max position per batch across the 3 planes and sequence dimension: (B,) def _parse_and_validate_image_input(
m_per_batch = position_ids.amax(dim=(1, 2)) self, **kwargs: object
) -> dict[str, torch.Tensor] | None:
pixel_values = kwargs.get("pixel_values")
image_grid_thw = kwargs.get("image_grid_thw")
if pixel_values is None or image_grid_thw is None:
return None
return {
"pixel_values": pixel_values,
"image_grid_thw": image_grid_thw,
}
mrope_position_delta = (m_per_batch + 1 - len(input_tokens)).item() def _process_image_input(
self,
image_input: dict[str, torch.Tensor],
) -> tuple[torch.Tensor, ...]:
pixel_values = image_input["pixel_values"]
image_grid_thw = image_input["image_grid_thw"]
if pixel_values.numel() == 0:
return ()
# vLLM expects shape [3, seq_len] but Isaac returns [batch, seq_len, 3] device = next(self.language_model.parameters()).device
# Transpose to match vLLM's expected format dtype = self.vision_embedding.linear_fc1.weight.dtype
position_ids = position_ids.squeeze(0).transpose(0, 1) pixel_values = pixel_values.to(device=device, dtype=dtype)
if image_grid_thw.dim() == 3:
image_grid_thw = image_grid_thw[0]
spatial_grids = image_grid_thw[:, 1:3].to(device, dtype=torch.int32)
return position_ids, mrope_position_delta vision_embeddings = self.vision_embedding((pixel_values, spatial_grids))
merge_size = self.config.vision_config.pixel_shuffle_scale_factor
sizes = spatial_grids.prod(-1) // (merge_size * merge_size)
return tuple(vision_embeddings.split(sizes.tolist()))
def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
    """Return the image embeddings for the given multimodal keyword inputs.

    The keyword arguments are passed to ``_parse_and_validate_image_input``.
    If that returns ``None``, an empty tuple is returned; otherwise the
    parsed input is handed to ``_process_image_input`` and its result is
    returned unchanged.
    """
    parsed = self._parse_and_validate_image_input(**kwargs)
    return () if parsed is None else self._process_image_input(parsed)
def get_multimodal_embeddings( def get_multimodal_embeddings(
self, **kwargs: object self, **kwargs: object
) -> MultiModalEmbeddings | None: ) -> MultiModalEmbeddings | None:
pixel_values = kwargs.get("pixel_values") # Backward compatibility for older runners.
image_grid_thw = kwargs.get("image_grid_thw") embeddings = self.embed_multimodal(**kwargs)
if not embeddings:
if pixel_values is None:
return [] return []
return embeddings
# Convert image_grid_thw from [batch, 1, [T, H, W]] to [batch, [H, W]] def get_language_model(self) -> torch.nn.Module:
spatial_grids = image_grid_thw[ return self.language_model
:, 0, 1:3
] # Extract H, W from [T, H, W] for each image
# Process packed sequence patches through vision_embedding module def forward(
vision_embeddings = self.vision_embedding((pixel_values, spatial_grids))
# Split concatenated embeddings for each image item (following Qwen2-VL pattern)
merge_size = (
self.config.vision_config.pixel_shuffle_scale_factor
) # Isaac uses pixel shuffle
sizes = spatial_grids.prod(-1) // (
merge_size * merge_size
) # H * W / (merge_size^2)
return vision_embeddings.split(sizes.tolist())
def get_input_embeddings(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
multimodal_embeddings: MultiModalEmbeddings | None = None, positions: torch.Tensor,
*, intermediate_tensors: IntermediateTensors | None = None,
is_multimodal: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
handle_oov_mm_token: bool = False, **kwargs: object,
) -> torch.Tensor: ) -> torch.Tensor | IntermediateTensors:
# Get text embeddings from the base language model return self.language_model(
inputs_embeds = super().get_input_embeddings(input_ids) input_ids=input_ids,
positions=positions,
intermediate_tensors=intermediate_tensors,
inputs_embeds=inputs_embeds,
**kwargs,
)
# If we have multimodal embeddings, merge them with text embeddings def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None:
if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: return self.language_model.compute_logits(hidden_states)
inputs_embeds = _merge_multimodal_embeddings(
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
is_multimodal=is_multimodal,
)
return inputs_embeds
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
skip_prefixes = [] loader = AutoWeightsLoader(self)
loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
def get_mm_mapping(self) -> MultiModelKeys: def get_mm_mapping(self) -> MultiModelKeys: