diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 0f3b730eabedc..3847fc15119fd 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -593,6 +593,7 @@ Specified using `--task generate`.
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ |
+| `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ | `internlm/Intern-S1`, etc. | | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
 | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index eb6b410848558..61f5525c6d7e7 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -468,6 +468,37 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# Intern-S1
+def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "internlm/Intern-S1"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={modality: 1},
+        enforce_eager=True,
+    )
+
+    placeholder = "<IMG_CONTEXT>"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # InternVL
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     model_name = "OpenGVLab/InternVL3-2B"
@@ -1303,6 +1334,7 @@ model_example_map = {
     "h2ovl_chat": run_h2ovl,
     "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
     "idefics3": run_idefics3,
+    "interns1": run_interns1,
     "internvl_chat": run_internvl,
     "nemotron_vl": run_nemotron_vl,
     "keye_vl": run_keye_vl,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 2e14fc807e104..e312a0953e9be 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -253,6 +253,33 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "internlm/Intern-S1"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "\n".join(
+        f"Image-{i}: <IMG_CONTEXT>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "OpenGVLab/InternVL2-2B"
@@ -946,6 +973,7 @@ model_example_map = {
     "gemma3": load_gemma3,
     "h2ovl_chat": load_h2ovl,
     "idefics3": load_idefics3,
+    "interns1": load_interns1,
     "internvl_chat": load_internvl,
     "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
     "keye_vl": load_keye_vl,
diff --git a/tests/models/registry.py b/tests/models/registry.py
index b41e432d738a7..0dc5aec8db12e 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -381,6 +381,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                          extras={"2B": "OpenGVLab/InternVL2-2B", "3.0": "OpenGVLab/InternVL3-1B"},  # noqa: E501
                                          trust_remote_code=True),
+    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
+                                                        trust_remote_code=True),
     "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                         {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
     "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview",  # noqa: E501
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
new file mode 100644
index 0000000000000..36204e4c5953f
--- /dev/null
+++ b/vllm/model_executor/models/interns1.py
@@ -0,0 +1,711 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# --------------------------------------------------------
+# InternS1
+# Copyright (c) 2025 Shanghai AI Lab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from collections.abc import Iterable, Mapping, Sequence
+from typing import Literal, Optional, TypedDict, Union
+
+import torch
+import torch.nn as nn
+from transformers import InternVLProcessor, PretrainedConfig
+from transformers.activations import ACT2FN
+from transformers.models.got_ocr2.image_processing_got_ocr2_fast import (
+    GotOcr2ImageProcessorFast)
+
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.models.interns1_vit import InternS1VisionModel
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   ImageSize, MultiModalDataItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement,
+                                        PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+                         SupportsMultiModal, SupportsPP)
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
+                    init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings)
+
+
+class InternS1MultiModalProjector(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size *
+                                       int(1 / config.downsample_ratio)**2)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size *
+            int(1 / config.downsample_ratio)**2,
+            config.text_config.hidden_size)
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(config.text_config.hidden_size,
+                                  config.text_config.hidden_size)
+
+    def forward(self, image_features):
+        hidden_states = self.layer_norm(image_features)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class InternS1ImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    pixel_values: torch.Tensor
+    """
+    Shape:
+    `(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
+    """
+
+
+class InternS1ImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: Union[torch.Tensor, list[torch.Tensor]]
+    """
+    A tensor of shape `(num_images, total_image_feature_size, hidden_size)`
+    or a list of tensors of shape `(total_image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    """
+
+
+InternS1ImageInputs = Union[InternS1ImagePixelInputs,
+                            InternS1ImageEmbeddingInputs]
+
+
+class InternS1VideoPixelInputs(TypedDict):
+    type: Literal["pixel_values_videos"]
+    pixel_values: torch.Tensor
+    """
+    Shape:
+    `(batch_size * num_videos * num_frames, num_channels, height, width)`
+    """
+
+    num_patches: torch.Tensor
+    """Shape: `(batch_size * num_images)`"""
+
+
+class InternS1VideoEmbeddingInputs(TypedDict):
+    type: Literal["video_embeds"]
+    data: Union[torch.Tensor, list[torch.Tensor]]
+    """
+    A tensor of shape `(num_videos, total_video_feature_size, hidden_size)`
+    or a list of tensors of shape `(total_video_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+ """ + + +InternS1VideoInputs = Union[InternS1VideoPixelInputs, + InternS1VideoEmbeddingInputs] + + +def resolve_interns1_min_max_num( + min_dynamic_patch: int, + max_dynamic_patch: int, + dynamic_image_size: bool, + use_thumbnail: bool, +) -> tuple[int, int]: + min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1 + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 + + if use_thumbnail and max_dynamic_patch != 1: + max_dynamic_patch += 1 + + return min_dynamic_patch, max_dynamic_patch + + +def get_interns1_target_ratios( + min_num: int, + max_num: int, +) -> list[tuple[int, int]]: + target_ratios = {(i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n + 1) if min_num <= i * j <= max_num} + return sorted(target_ratios, key=lambda x: x[0] * x[1]) + + +class InternS1ProcessingInfo(BaseProcessingInfo): + """Basic image-only ProcessingInfo for InternS1-style models.""" + + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: + return self.ctx.get_hf_processor(InternVLProcessor, **kwargs) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional['GotOcr2ImageProcessorFast'] = None, + ) -> int: + if processor is None: + processor = self.get_hf_processor().image_processor + + if not isinstance(processor, GotOcr2ImageProcessorFast): + raise ValueError(f'GotOcr2ImageProcessorFast is expected but got ' + f'{type(processor)}') + num_image_patches = processor.get_number_of_image_tokens( + image_height, image_width, images_kwargs=dict()) + num_image_tokens = self.get_hf_processor( + ).image_seq_length * num_image_patches + return num_image_tokens + + def resolve_target_ratios(self, use_thumbnail: Optional[bool] = None): + image_processor = self.get_hf_processor().image_processor + min_dynamic_patch = image_processor.min_patches + max_dynamic_patch = image_processor.max_patches + # HF format's InternVL processor uses `crop_to_patches` which is + # equivalent to `use_thumbnail` in original format. 
+        use_thumbnail = image_processor.crop_to_patches
+        dynamic_image_size = True
+        min_num, max_num = resolve_interns1_min_max_num(
+            min_dynamic_patch,
+            max_dynamic_patch,
+            dynamic_image_size,
+            use_thumbnail=use_thumbnail)
+
+        return get_interns1_target_ratios(min_num, max_num)
+
+    def get_image_size_with_most_features(self) -> ImageSize:
+        processor = self.get_hf_processor()
+
+        hf_config = self.ctx.get_hf_config()
+        base_height, base_width = hf_config.vision_config.image_size
+        target_ratios = self.resolve_target_ratios()
+
+        largest_feature_size, largest_feature_pinpoint = 0, None
+        for wr, hr in target_ratios:
+            width, height = base_width * wr, base_height * hr
+
+            feat_size = self.get_num_image_tokens(
+                image_width=width,
+                image_height=height,
+                processor=processor.image_processor,
+            )
+            if feat_size > largest_feature_size:
+                largest_feature_size = feat_size
+                largest_feature_pinpoint = ImageSize(width=width,
+                                                     height=height)
+
+        assert not (largest_feature_size == 0 or largest_feature_pinpoint
+                    is None), ("Cannot have a largest feature size of 0!")
+
+        return largest_feature_pinpoint
+
+    def get_max_image_tokens(self) -> int:
+        processor = self.get_hf_processor()
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            processor=processor.image_processor,
+        )
+
+
+class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]
+                                 ):
+    """Basic image-only DummyInputsBuilder for InternS1-style models."""
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+        image_token = self.info.get_hf_processor().image_token
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        num_images = mm_counts.get("image", 0)
+
+        return {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+
+
+class InternS1MultiModalProcessor(
+        BaseMultiModalProcessor[InternS1ProcessingInfo]):
+    """Basic image-only MultiModalProcessor for InternS1-style models."""
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+        tok_kwargs: Mapping[str, object],
+    ) -> Mapping[str, NestedTensors]:
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
+        )
+
+        hf_processor = self.info.get_hf_processor(**mm_kwargs)
+        image_token_id = hf_processor.image_token_id
+
+        # Since there may be extra tokens in the feature placeholders,
+        # we need to pass the image token ID to the model to select the
+        # tokens to merge from the vision encoder outputs
+        processed_outputs["image_token_id"] = torch.tensor(image_token_id)
+
+        images = mm_data.get('images', None)
+        image_processor = self.info.get_hf_processor().image_processor
+        if images is not None:
+            image_inputs = image_processor(images=images)
+            image_num_patches = image_inputs.pop("num_patches")
+            if not isinstance(image_num_patches, list):
+                raise ValueError(
+                    f'num_patches is supposed to be list, but got '
+                    f'{type(image_num_patches)}')
+            image_num_patches = torch.tensor(image_num_patches)
+            processed_outputs['image_num_patches'] = image_num_patches
+
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: Mapping[str, NestedTensors],
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+
+        image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
+        num_images = len(image_num_patches)
+
+        return dict(
+            pixel_values=MultiModalFieldConfig.flat_from_sizes(
+                "image", image_num_patches),
+            image_num_patches=MultiModalFieldConfig.batched("image"),
+            image_embeds=MultiModalFieldConfig.batched("image"),
+            image_token_id=MultiModalFieldConfig.shared("image", num_images),
+        )
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+        img_context_token = hf_processor.image_token
+        start_image_token = hf_processor.start_image_token
+        end_image_token = hf_processor.end_image_token
+
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                feature_size = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                feature_size = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    processor=hf_processor.image_processor,
+                )
+
+            repl_features = img_context_token * feature_size
+            repl_full = start_image_token + repl_features + end_image_token
+            return PromptUpdateDetails.select_text(repl_full,
+                                                   img_context_token)
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=img_context_token,
+                replacement=get_replacement,
+            )
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    InternS1MultiModalProcessor,
+    info=InternS1ProcessingInfo,
+    dummy_inputs=InternS1DummyInputsBuilder)
+class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                       SupportsPP, SupportsLoRA):
+
+    # To ensure correct weight loading and mapping.
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "lm_head.": "language_model.lm_head.",
+            "model.language_model.": "language_model.model.",
+            "model.vision_tower.": "vision_tower.",
+            "model.multi_modal_projector.": "multi_modal_projector.",
+        })
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+        # transformers InternVLProcessor uses <IMG_CONTEXT> as the separator
+        # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116
+        if modality.startswith("image"):
+            return '<IMG_CONTEXT>'
+        if modality.startswith("video"):
+            return "