mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-23 23:11:28 +08:00
Merge bab4dea597e3ecefd665ca9b530225407adfbcb8 into 254f6b986720c92ddf97fbb1a6a6465da8e87e29
This commit is contained in:
commit
7be6e6ce62
@ -688,6 +688,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
|
||||
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
|
||||
| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
|
||||
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
|
||||
| `IsaacForConditionalGeneration` | Isaac | T + I<sup>+</sup> | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ |
|
||||
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
|
||||
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ |
|
||||
|
||||
@ -56,3 +56,5 @@ pydantic>=2.12 # 2.11 leads to error on python 3.13
|
||||
decord==0.6.0
|
||||
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
|
||||
gpt-oss >= 0.0.7; python_version > '3.11'
|
||||
|
||||
perceptron # required for isaac test
|
||||
|
||||
@ -135,6 +135,7 @@ cloudpickle==3.1.1
|
||||
# via mlflow-skinny
|
||||
colorama==0.4.6
|
||||
# via
|
||||
# perceptron
|
||||
# sacrebleu
|
||||
# schemathesis
|
||||
# tqdm-multiprocess
|
||||
@ -302,6 +303,8 @@ h11==0.14.0
|
||||
# via
|
||||
# httpcore
|
||||
# uvicorn
|
||||
h2==4.3.0
|
||||
# via httpx
|
||||
h5py==3.13.0
|
||||
# via terratorch
|
||||
harfile==0.3.0
|
||||
@ -310,6 +313,8 @@ hf-xet==1.1.7
|
||||
# via huggingface-hub
|
||||
hiredis==3.0.0
|
||||
# via tensorizer
|
||||
hpack==4.1.0
|
||||
# via h2
|
||||
html2text==2025.4.15
|
||||
# via gpt-oss
|
||||
httpcore==1.0.6
|
||||
@ -317,6 +322,7 @@ httpcore==1.0.6
|
||||
httpx==0.27.2
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# perceptron
|
||||
# schemathesis
|
||||
huggingface-hub==0.34.3
|
||||
# via
|
||||
@ -338,6 +344,8 @@ hydra-core==1.3.2
|
||||
# via
|
||||
# lightly
|
||||
# lightning
|
||||
hyperframe==6.1.0
|
||||
# via h2
|
||||
hypothesis==6.131.0
|
||||
# via
|
||||
# hypothesis-graphql
|
||||
@ -549,6 +557,7 @@ numpy==1.26.4
|
||||
# pandas
|
||||
# patsy
|
||||
# peft
|
||||
# perceptron
|
||||
# pycocotools
|
||||
# pyogrio
|
||||
# rasterio
|
||||
@ -702,6 +711,8 @@ peft==0.16.0
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# lm-eval
|
||||
perceptron==0.1.4
|
||||
# via -r requirements/test.in
|
||||
pillow==10.4.0
|
||||
# via
|
||||
# genai-perf
|
||||
@ -709,6 +720,7 @@ pillow==10.4.0
|
||||
# lightly-utils
|
||||
# matplotlib
|
||||
# mistral-common
|
||||
# perceptron
|
||||
# scikit-image
|
||||
# segmentation-models-pytorch
|
||||
# sentence-transformers
|
||||
@ -952,6 +964,7 @@ rich==13.9.4
|
||||
# genai-perf
|
||||
# lightning
|
||||
# mteb
|
||||
# perceptron
|
||||
# typer
|
||||
rioxarray==0.19.0
|
||||
# via terratorch
|
||||
@ -1024,7 +1037,9 @@ shapely==2.1.1
|
||||
# geopandas
|
||||
# torchgeo
|
||||
shellingham==1.5.4
|
||||
# via typer
|
||||
# via
|
||||
# perceptron
|
||||
# typer
|
||||
six==1.16.0
|
||||
# via
|
||||
# junit-xml
|
||||
@ -1218,7 +1233,9 @@ typepy==1.3.2
|
||||
# pytablewriter
|
||||
# tabledata
|
||||
typer==0.15.2
|
||||
# via fastsafetensors
|
||||
# via
|
||||
# fastsafetensors
|
||||
# perceptron
|
||||
types-python-dateutil==2.9.0.20241206
|
||||
# via arrow
|
||||
typeshed-client==2.8.2
|
||||
|
||||
@ -529,6 +529,31 @@ VLM_TEST_SETTINGS = {
|
||||
use_tokenizer_eos=True,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
),
|
||||
"isaac": VLMTestInfo(
|
||||
models=["PerceptronAI/Isaac-0.1"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: (
|
||||
f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"
|
||||
),
|
||||
img_idx_to_prompt=lambda idx: "<image>",
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<vlm_image>Please describe the image shortly.",
|
||||
"cherry_blossom": "<vlm_image>Please infer the season with reason.",
|
||||
}
|
||||
),
|
||||
multi_image_prompt=(
|
||||
"Picture 1: <vlm_image>\n"
|
||||
"Picture 2: <vlm_image>\n"
|
||||
"Describe these two images with one paragraph respectively."
|
||||
),
|
||||
enforce_eager=False,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
hf_model_kwargs={"device_map": "auto"},
|
||||
patch_hf_runner=model_utils.isaac_patch_hf_runner,
|
||||
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
),
|
||||
"kimi_vl": VLMTestInfo(
|
||||
models=["moonshotai/Kimi-VL-A3B-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
|
||||
278
tests/models/multimodal/generation/test_isaac.py
Normal file
278
tests/models/multimodal/generation/test_isaac.py
Normal file
@ -0,0 +1,278 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Separated from test_common.py because HF loading for PerceptronAI/Isaac-0.1
|
||||
requires perceptron package (Run 'pip install perceptron').
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
pytest.importorskip("perceptron", reason="Requires 'pip install perceptron'")
|
||||
|
||||
import types
|
||||
from pathlib import PosixPath
|
||||
|
||||
import torch
|
||||
from perceptron.tensorstream import TextType
|
||||
from perceptron.tensorstream.ops import compute_mrope_pos_tensor, modality_mask
|
||||
from transformers.modeling_outputs import BaseModelOutputWithPast
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner
|
||||
from .vlm_utils import runners
|
||||
from .vlm_utils.case_filtering import get_parametrized_options
|
||||
from .vlm_utils.types import ExpandableVLMTestArgs, VLMTestInfo, VLMTestType
|
||||
|
||||
|
||||
def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
    r"""Build 3-axis position indices for a plain token batch.

    Args:
        input_ids (`torch.Tensor`):
            Tensor of shape `(batch_size, seq_len)`; only its shape and
            device are read.

    Returns:
        `torch.Tensor`: Indices of shape `(batch_size, seq_len, 3)`. Every
        row counts `0..seq_len-1` and the three channels are copies of that
        same 1D position, as consumed by the 3-axis MRoPE rotary embedding.
    """
    num_rows, num_tokens = input_ids.shape
    token_index = torch.arange(num_tokens, device=input_ids.device)
    # Shape (1, seq_len, 1) -> broadcast view over batch rows and MRoPE axes.
    return token_index[None, :, None].expand(num_rows, -1, 3)
|
||||
|
||||
|
||||
def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patch HF runner for Isaac:
    1) move processor outputs to model device
    2) ensure IsaacModel.forward returns hidden_states
    for compatibility with hidden_states_to_seq_logprobs()

    Args:
        hf_model: Runner to patch. Its ``processor`` attribute and the
            ``forward`` of ``hf_model.model.model`` are replaced in place.

    Returns:
        The same ``hf_model`` object, after patching.
    """

    model_device = next(hf_model.model.parameters()).device

    # ----------------------------
    # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device
    # ----------------------------
    original_processor = hf_model.processor

    def patched_processor(*args, **kwargs):
        result = original_processor(*args, **kwargs)
        for k, v in result.data.items():
            # Only values exposing ``.to`` can be moved; anything else
            # (e.g. None or plain Python metadata) is left untouched
            # instead of raising AttributeError.
            if hasattr(v, "to"):
                result[k] = v.to(model_device)
        return result

    hf_model.processor = patched_processor

    # ----------------------------
    # 2) Patch IsaacModel.forward: add hidden_states to the output
    # ----------------------------
    isaac_model = hf_model.model.model  # IsaacModel

    def patched_forward(
        self,
        input_ids=None,
        tensor_stream=None,
        attention_mask=None,
        position_ids=None,
        modality_tensor=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
        **kwargs,
    ):
        """
        Forward pass with MRoPE position embeddings.

        Computes position embeddings once and passes them through all layers.
        Always returns a BaseModelOutputWithPast; ``return_dict`` is accepted
        for signature compatibility but is not consulted.
        """
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        # Get inputs
        if tensor_stream is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both tensor_stream and inputs_embeds")
        elif tensor_stream is not None:
            # Embed TensorStream directly
            inputs_embeds = self.embed_stream(tensor_stream)
            # Create modality tensor if not provided
            if modality_tensor is None:
                modality_tensor = modality_mask(tensor_stream)
        elif input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            inputs_embeds = self.embed_tokens(input_ids)
            # Create text modality tensor if not provided
            if modality_tensor is None:
                batch_size, seq_length = input_ids.shape
                modality_tensor = torch.full(
                    (batch_size, seq_length),
                    TextType.text.value,
                    device=input_ids.device,
                    dtype=torch.long,
                )
        elif inputs_embeds is None:
            raise ValueError(
                "You have to specify either tensor_stream, input_ids or inputs_embeds"
            )

        # Create default position_ids if not provided
        if position_ids is None:
            if tensor_stream is not None:
                position_ids = compute_mrope_pos_tensor(tensor_stream)  # (B,L,3)
            else:
                # NOTE: this fallback reads input_ids.shape, so a call that
                # passes only inputs_embeds must also pass position_ids.
                position_ids = compute_position_ids_input_ids(input_ids)

        # Compute MRoPE position embeddings if we have custom rotary_emb
        cos, sin = self.rotary_emb(position_ids, modality_tensor)
        cos = cos.to(inputs_embeds.dtype)
        sin = sin.to(inputs_embeds.dtype)

        # Prepare attention mask
        if attention_mask is not None:
            attention_mask = self._update_causal_mask(
                attention_mask, inputs_embeds, cache_position, past_key_values, False
            )

        # Initialize and collect hidden states
        hidden_states = inputs_embeds
        hidden_states_list: list[torch.Tensor] = []

        if output_hidden_states:
            hidden_states_list.append(hidden_states)

        for decoder_layer in self.layers:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=(cos, sin),
                **kwargs,
            )

            # Decoder layers may return either a tuple or the tensor itself
            hidden_states = (
                layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs
            )

            if output_hidden_states:
                hidden_states_list.append(hidden_states)

        # Final layer norm
        hidden_states = self.norm(hidden_states)

        # The normalized output is appended as one more entry after the
        # per-layer states collected above.
        if output_hidden_states:
            hidden_states_list.append(hidden_states)

        # Convert to tuple or None
        all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None

        # Include hidden_states for compatibility with hidden_states_to_seq_logprobs()
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
        )

    # Bind as a method on this one instance so ``self`` resolves to IsaacModel
    isaac_model.forward = types.MethodType(patched_forward, isaac_model)

    return hf_model
|
||||
|
||||
|
||||
# Test configuration for Isaac, keyed by model type. The two test wrappers
# in this file look entries up by the parametrized ``model_type``.
ISAAC_TEST_SETTINGS = {
    "isaac": VLMTestInfo(
        models=["PerceptronAI/Isaac-0.1"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        # Wraps the image prompt in the chat turn markers used for this model
        prompt_formatter=lambda img_prompt: (
            f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"
        ),
        # Same placeholder for every image index.
        # NOTE(review): presumably substituted for "<vlm_image>" in the prompts
        # below by the shared VLM test utilities - confirm in vlm_utils.
        img_idx_to_prompt=lambda idx: "<image>",
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<vlm_image>Please describe the image shortly.",
                "cherry_blossom": "<vlm_image>Please infer the season with reason.",
            }
        ),
        multi_image_prompt=(
            "Picture 1: <vlm_image>\n"
            "Picture 2: <vlm_image>\n"
            "Describe these two images with one paragraph respectively."
        ),
        enforce_eager=False,
        max_model_len=4096,
        max_num_seqs=2,
        hf_model_kwargs={"device_map": "auto"},
        # Defined above in this file; patches the HF processor and forward
        patch_hf_runner=isaac_patch_hf_runner,
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
    )
}
|
||||
|
||||
|
||||
### Test wrappers
|
||||
# Wrappers around the test running func for:
|
||||
# - single image
|
||||
# - multi-image
|
||||
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        ISAAC_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=False,
    ),
)
def test_isaac_single_image(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Run the shared single-image test for each Isaac IMAGE case."""
    # The settings entry is looked up by the parametrized model type and
    # handed straight to the shared runner.
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=ISAAC_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        ISAAC_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=False,
    ),
)
def test_isaac_multi_image(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    """Run the shared multi-image test for each Isaac MULTI_IMAGE case."""
    # The settings entry is looked up by the parametrized model type and
    # handed straight to the shared runner.
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=ISAAC_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )
|
||||
@ -522,6 +522,183 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
return hf_model
|
||||
|
||||
|
||||
def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patch HF runner for Isaac:
    1) Move processor outputs to model device
    2) Ensure IsaacModel.forward returns hidden_states
    for compatibility with hidden_states_to_seq_logprobs()

    Additionally wraps ``hf_model.model.generate`` so that every call uses
    the tokenizer's EOS token id as both ``pad_token_id`` and
    ``eos_token_id``, overriding any values passed by the caller.

    Args:
        hf_model: Runner to patch. Its ``processor``, ``model.generate`` and
            the ``forward`` of ``hf_model.model.model`` are replaced in place.

    Returns:
        The same ``hf_model`` object, after patching.
    """

    # Imported lazily so this module can be imported without ``perceptron``
    from perceptron.tensorstream import TextType
    from perceptron.tensorstream.ops import compute_mrope_pos_tensor, modality_mask
    from transformers.modeling_outputs import BaseModelOutputWithPast

    def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
        """
        Create 3D positional indices for token input.

        Returns a `(batch_size, seq_len, 3)` tensor whose three channels all
        hold the 1D position ``0..seq_len-1``.
        """
        batch_size, seq_length = input_ids.shape
        position_ids = torch.arange(seq_length, device=input_ids.device)
        position_ids = position_ids.view(1, -1).expand(batch_size, -1)
        position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3)  # Add 3D for MRoPE
        return position_ids

    model_device = next(hf_model.model.parameters()).device

    # ----------------------------
    # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device
    # ----------------------------
    original_processor = hf_model.processor

    def patched_processor(*args, **kwargs):
        result = original_processor(*args, **kwargs)
        for k, v in result.data.items():
            # Only values exposing ``.to`` can be moved; anything else
            # (e.g. None or plain Python metadata) is left untouched
            # instead of raising AttributeError.
            if hasattr(v, "to"):
                result[k] = v.to(model_device)
        return result

    hf_model.processor = patched_processor

    tokenizer = AutoTokenizer.from_pretrained(
        hf_model.model_name, trust_remote_code=True
    )

    original_generate = hf_model.model.generate

    def patched_generate(*args, **kwargs):
        # Unconditionally force EOS as both the pad and the stop token
        kwargs["pad_token_id"] = tokenizer.eos_token_id
        kwargs["eos_token_id"] = tokenizer.eos_token_id
        return original_generate(*args, **kwargs)

    hf_model.model.generate = patched_generate

    # ----------------------------
    # 2) Patch IsaacModel.forward: add hidden_states to the output
    # ----------------------------
    isaac_model = hf_model.model.model

    def patched_forward(
        self,
        input_ids=None,
        tensor_stream=None,
        attention_mask=None,
        position_ids=None,
        modality_tensor=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
        **kwargs,
    ):
        """
        Forward pass with MRoPE position embeddings.

        Computes position embeddings once and passes them through all layers.
        Always returns a BaseModelOutputWithPast; ``return_dict`` is accepted
        for signature compatibility but is not consulted.
        """
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        # Get inputs
        if tensor_stream is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both tensor_stream and inputs_embeds")
        elif tensor_stream is not None:
            # Embed TensorStream directly
            inputs_embeds = self.embed_stream(tensor_stream)
            # Create modality tensor if not provided
            if modality_tensor is None:
                modality_tensor = modality_mask(tensor_stream)
        elif input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            inputs_embeds = self.embed_tokens(input_ids)
            # Create text modality tensor if not provided
            if modality_tensor is None:
                batch_size, seq_length = input_ids.shape
                modality_tensor = torch.full(
                    (batch_size, seq_length),
                    TextType.text.value,
                    device=input_ids.device,
                    dtype=torch.long,
                )
        elif inputs_embeds is None:
            raise ValueError(
                "You have to specify either tensor_stream, input_ids or inputs_embeds"
            )

        # Create default position_ids if not provided
        if position_ids is None:
            if tensor_stream is not None:
                position_ids = compute_mrope_pos_tensor(tensor_stream)  # (B,L,3)
            else:
                # NOTE: this fallback reads input_ids.shape, so a call that
                # passes only inputs_embeds must also pass position_ids.
                position_ids = compute_position_ids_input_ids(input_ids)

        # Compute MRoPE position embeddings if we have custom rotary_emb
        cos, sin = self.rotary_emb(position_ids, modality_tensor)
        cos = cos.to(inputs_embeds.dtype)
        sin = sin.to(inputs_embeds.dtype)

        # Prepare attention mask
        if attention_mask is not None:
            attention_mask = self._update_causal_mask(
                attention_mask, inputs_embeds, cache_position, past_key_values, False
            )

        # Initialize and collect hidden states
        hidden_states = inputs_embeds
        hidden_states_list: list[torch.Tensor] = []

        if output_hidden_states:
            hidden_states_list.append(hidden_states)

        for decoder_layer in self.layers:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=(cos, sin),
                **kwargs,
            )

            # Decoder layers may return either a tuple or the tensor itself
            hidden_states = (
                layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs
            )

            if output_hidden_states:
                hidden_states_list.append(hidden_states)

        # Final layer norm
        hidden_states = self.norm(hidden_states)

        # The normalized output is appended as one more entry after the
        # per-layer states collected above.
        if output_hidden_states:
            hidden_states_list.append(hidden_states)

        # Convert to tuple or None
        all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None

        # Include hidden_states for compatibility with hidden_states_to_seq_logprobs()
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
        )

    # Bind as a method on this one instance so ``self`` resolves to IsaacModel
    isaac_model.forward = types.MethodType(patched_forward, isaac_model)

    return hf_model
|
||||
|
||||
|
||||
def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for SkyworkR1V."""
|
||||
|
||||
|
||||
@ -662,6 +662,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"HuggingFaceM4/Idefics3-8B-Llama3",
|
||||
extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"},
|
||||
),
|
||||
"IsaacForConditionalGeneration": _HfExamplesInfo(
|
||||
"PerceptronAI/Isaac-0.1",
|
||||
trust_remote_code=True,
|
||||
),
|
||||
"InternS1ForConditionalGeneration": _HfExamplesInfo(
|
||||
"internlm/Intern-S1", trust_remote_code=True
|
||||
),
|
||||
|
||||
1480
vllm/model_executor/models/isaac.py
Normal file
1480
vllm/model_executor/models/isaac.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -333,6 +333,7 @@ _MULTIMODAL_MODELS = {
|
||||
"idefics3",
|
||||
"Idefics3ForConditionalGeneration",
|
||||
),
|
||||
"IsaacForConditionalGeneration": ("isaac", "IsaacForConditionalGeneration"),
|
||||
"SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"), # noqa: E501
|
||||
"KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
|
||||
"KeyeVL1_5ForConditionalGeneration": (
|
||||
|
||||
@ -81,6 +81,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
|
||||
deepseek_v32="DeepseekV3Config",
|
||||
flex_olmo="FlexOlmoConfig",
|
||||
hunyuan_vl="HunYuanVLConfig",
|
||||
isaac="IsaacConfig",
|
||||
kimi_linear="KimiLinearConfig",
|
||||
kimi_vl="KimiVLConfig",
|
||||
RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct)
|
||||
|
||||
@ -25,6 +25,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
|
||||
"HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
|
||||
"HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
|
||||
"HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
|
||||
"IsaacConfig": "vllm.transformers_utils.configs.isaac",
|
||||
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
|
||||
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
|
||||
# `FalconConfig` class from the official HuggingFace transformers library.
|
||||
@ -41,6 +42,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
|
||||
"NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
|
||||
"Olmo3Config": "vllm.transformers_utils.configs.olmo3",
|
||||
"OvisConfig": "vllm.transformers_utils.configs.ovis",
|
||||
"PixelShuffleSiglip2VisionConfig": "vllm.transformers_utils.configs.isaac",
|
||||
"RadioConfig": "vllm.transformers_utils.configs.radio",
|
||||
"SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
|
||||
"UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
|
||||
@ -65,6 +67,7 @@ __all__ = [
|
||||
"HunYuanVLConfig",
|
||||
"HunYuanVLTextConfig",
|
||||
"HunYuanVLVisionConfig",
|
||||
"IsaacConfig",
|
||||
"RWConfig",
|
||||
"JAISConfig",
|
||||
"Lfm2MoeConfig",
|
||||
@ -78,6 +81,7 @@ __all__ = [
|
||||
"NemotronHConfig",
|
||||
"Olmo3Config",
|
||||
"OvisConfig",
|
||||
"PixelShuffleSiglip2VisionConfig",
|
||||
"RadioConfig",
|
||||
"SpeculatorsConfig",
|
||||
"UltravoxConfig",
|
||||
|
||||
86
vllm/transformers_utils/configs/isaac.py
Normal file
86
vllm/transformers_utils/configs/isaac.py
Normal file
@ -0,0 +1,86 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from __future__ import annotations
|
||||
|
||||
from transformers import Qwen3Config
|
||||
from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig
|
||||
|
||||
|
||||
class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig):
    """Vision configuration for Isaac with Pixel Shuffle support.

    Extends Siglip2VisionConfig with additional fields for pixel shuffle.

    Args:
        pixel_shuffle_scale_factor: Stored on the config as
            ``pixel_shuffle_scale_factor``.
        num_patches: Stored on the config as ``num_patches``.
        **kwargs: Forwarded unchanged to ``Siglip2VisionConfig.__init__``.
    """

    model_type = "pixel_shuffle_siglip2"
    base_config_key = "vision_config"

    def __init__(
        self,
        pixel_shuffle_scale_factor: int = 1,
        num_patches: int = 256,
        **kwargs,
    ):
        # The two named arguments are not passed on to the base class;
        # they are set on the instance after the base initializer has run.
        super().__init__(**kwargs)

        # Add our custom fields
        self.pixel_shuffle_scale_factor = pixel_shuffle_scale_factor
        self.num_patches = num_patches
|
||||
|
||||
|
||||
class IsaacConfig(Qwen3Config):
    """Configuration class for Isaac multimodal model.

    Extends ``Qwen3Config`` with vision/processing fields and a nested
    ``vision_config`` (a ``PixelShuffleSiglip2VisionConfig`` unless the
    caller supplies another config object).
    """

    model_type = "isaac"
    sub_configs = {"vision_config": PixelShuffleSiglip2VisionConfig}

    def __init__(
        self,
        vision_config=None,
        vision_patch_size: int = 16,
        vision_max_num_patches: int = 256,
        vision_min_num_patches: int | None = None,
        pixel_shuffle_scale: int = 1,
        max_sequence_length: int = 16384,
        vision_token: str = "<image>",
        vision_attn_implementation: str | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # EventStreamProcessor parameters (for backward compatibility)
        # NOTE(review): the patch size is stored as ``video_patch_size``
        # although the argument is ``vision_patch_size`` - confirm intended.
        self.video_patch_size = vision_patch_size
        self.vision_max_num_patches = vision_max_num_patches
        self.vision_min_num_patches = vision_min_num_patches
        self.pixel_shuffle_scale = pixel_shuffle_scale

        # Processing parameters
        self.max_sequence_length = max_sequence_length
        self.vision_token = vision_token

        # Normalize vision_config: nothing -> defaults, dict -> config object,
        # anything else is taken as an already-built config.
        if vision_config is None:
            vision_config = PixelShuffleSiglip2VisionConfig()
        elif isinstance(vision_config, dict):
            vision_config = PixelShuffleSiglip2VisionConfig(**vision_config)
        self.vision_config = vision_config

        # Ensure compatibility with pretrained checkpoints: keep a value the
        # vision config already has, otherwise fall back to the top-level one.
        backfills = (
            ("pixel_shuffle_scale_factor", pixel_shuffle_scale),
            ("num_patches", vision_max_num_patches),
        )
        for field_name, fallback in backfills:
            setattr(
                self.vision_config,
                field_name,
                getattr(self.vision_config, field_name, fallback),
            )

        self.vision_attn_implementation = vision_attn_implementation
|
||||
|
||||
|
||||
__all__ = [
|
||||
"IsaacConfig",
|
||||
"PixelShuffleSiglip2VisionConfig",
|
||||
]
|
||||
Loading…
x
Reference in New Issue
Block a user