diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 7a209b6587a62..001a5b96174ac 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -634,6 +634,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 |--------------|--------|--------|-------------------|----------------------|---------------------------|
 | `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
 | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ |
+| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
 | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ |
 | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ |
 | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ |
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 5d9cac8dfabbf..35311a0ca7e1a 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -90,6 +90,33 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# Bee-8B
+def run_bee(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "Open-Bee/Bee-8B-RL"
+
+    prompts = [
+        (
+            f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<image>\n{question}<|im_end|>"
+            f"<|im_start|>assistant\n\n"
+        )
+        for question in questions
+    ]
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=16384,
+        limit_mm_per_prompt={modality: 1},
+        trust_remote_code=True,
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # BLIP-2
 def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1708,6 +1735,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
 model_example_map = {
     "aria": run_aria,
     "aya_vision": run_aya_vision,
+    "bee": run_bee,
     "blip-2": run_blip2,
     "chameleon": run_chameleon,
     "dots_ocr": run_dots_ocr,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index accb6c742a2b6..bd7e1d6b0466b 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -107,6 +107,41 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_bee(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Open-Bee/Bee-8B-RL"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=16384,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+        trust_remote_code=True,
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "CohereLabs/command-a-vision-07-2025"
 
@@ -1215,6 +1250,7 @@ def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
 model_example_map = {
     "aria": load_aria,
     "aya_vision": load_aya_vision,
+    "bee": load_bee,
     "command_a_vision": load_command_a_vision,
     "deepseek_vl_v2": load_deepseek_vl2,
     "gemma3": load_gemma3,
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index 78bd284b565f5..4e693b3102771 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -326,6 +326,7 @@ def _test_processing_correctness_one(
     [
         "rhymes-ai/Aria",
         "CohereForAI/aya-vision-8b",
+        "Open-Bee/Bee-8B-RL",
         "Salesforce/blip2-opt-2.7b",
         "facebook/chameleon-7b",
         "CohereLabs/command-a-vision-07-2025",
diff --git a/tests/models/registry.py b/tests/models/registry.py
index c6d6fa3f52ba5..7345d2e07dc7b 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -566,6 +566,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     # [Decoder-only]
     "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
     "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"),
+    "BeeForConditionalGeneration": _HfExamplesInfo(
+        "Open-Bee/Bee-8B-RL",
+        trust_remote_code=True,
+    ),
     "Blip2ForConditionalGeneration": _HfExamplesInfo(
         "Salesforce/blip2-opt-2.7b",
         extras={"6b": "Salesforce/blip2-opt-6.7b"},
diff --git a/vllm/model_executor/models/bee.py b/vllm/model_executor/models/bee.py
new file mode 100644
index 0000000000000..4f0342df404b3
--- /dev/null
+++ b/vllm/model_executor/models/bee.py
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Mapping
+
+import torch
+import torch.nn as nn
+from transformers.activations import GELUActivation
+
+from vllm.config import VllmConfig
+from vllm.config.multimodal import BaseDummyOptions
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalDataDict
+
+from .llava_next import (
+    LlavaDummyInputsBuilder,
+    LlavaNextMultiModalProcessor,
+    LlavaNextProcessingInfo,
+)
+from .llava_onevision import LlavaOnevisionForConditionalGeneration
+from .utils import WeightsMapper
+
+
+class BeeProcessingInfo(LlavaNextProcessingInfo):
+    def get_hf_config(self):
+        return self.ctx.get_hf_config()
+
+    def get_hf_processor(self, **kwargs: object):
+        return self.ctx.get_hf_processor(**kwargs)
+
+    def _get_num_unpadded_features(
+        self,
+        *,
+        original_height: int,
+        original_width: int,
+        npatches: int,
+        num_patch_height: int,
+        num_patch_width: int,
+    ) -> tuple[int, int]:
+        """Override to use the correct max_num_patches from vision_aspect_ratio."""
+        import math
+
+        current_height = npatches * num_patch_height
+        current_width = npatches * num_patch_width
+
+        aspect_ratio = original_width / original_height
+        current_aspect_ratio = current_width / current_height
+
+        if aspect_ratio > current_aspect_ratio:
+            new_height = int(
+                round(original_height * (current_width / original_width), 7)
+            )
+            padding = (current_height - new_height) // 2
+            current_height = current_height - (2 * padding)
+        else:
+            new_width = int(
+                round(original_width * (current_height / original_height), 7)
+            )
+            padding = (current_width - new_width) // 2
+            current_width = current_width - (2 * padding)
+
+        unpadded_features = current_height * current_width
+        newline_features = current_height
+
+        # Get max_num_patches from the vision_aspect_ratio config
+        hf_config = self.get_hf_config()
+        vision_aspect_ratio = getattr(hf_config, "vision_aspect_ratio", "anyres_max_9")
+        max_num_patches = int(vision_aspect_ratio.replace("anyres_max_", ""))
+
+        ratio = math.sqrt(
+            current_height * current_width / (max_num_patches * npatches**2)
+        )
+        if ratio > 1.1:
+            height_factor = int(current_height // ratio)
+            width_factor = int(current_width // ratio)
+            unpadded_features = height_factor * width_factor
+            newline_features = height_factor
+
+        return (unpadded_features, newline_features)
+
+
+class BeeDummyInputsBuilder(LlavaDummyInputsBuilder[BeeProcessingInfo]):
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+        image_token = "<image>"
+
+        return image_token * num_images
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+        mm_options: Mapping[str, BaseDummyOptions] | None = None,
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        image_overrides = mm_options.get("image") if mm_options else None
+
+        return {
+            "image": self._get_dummy_images(
+                width=target_width,
+                height=target_height,
+                num_images=num_images,
+                overrides=image_overrides,
+            ),
+        }
+
+
+class BeeMultiModalProjector(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.pre_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=1e-06)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size,
+            config.text_config.hidden_size * 4,
+            bias=True,
+        )
+        self.act = GELUActivation()
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size * 4,
+            config.text_config.hidden_size,
+            bias=True,
+        )
+
+    def forward(self, image_feature: torch.Tensor) -> torch.Tensor:
+        image_feature = self.pre_norm(image_feature)
+        hidden_states = self.linear_1(image_feature)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+
+        return hidden_states
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    LlavaNextMultiModalProcessor,
+    info=BeeProcessingInfo,
+    dummy_inputs=BeeDummyInputsBuilder,
+)
+class BeeForConditionalGeneration(LlavaOnevisionForConditionalGeneration):
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            # mapping for new names in checkpoints saved after
+            # transformers v4.55
+            "model.language_model.": "language_model.model.",
+            "model.vision_tower.": "vision_tower.",
+            "model.multi_modal_projector.": "multi_modal_projector.",
+            "model.image_newline": "image_newline",
+            "lm_head.": "language_model.lm_head.",
+        }
+    )
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+        config = vllm_config.model_config.hf_config
+        self.multi_modal_projector = BeeMultiModalProjector(config)
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 118c225431a50..da1606a7568dd 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -247,6 +247,7 @@ _MULTIMODAL_MODELS = {
         "aya_vision",
         "AyaVisionForConditionalGeneration",
     ),
+    "BeeForConditionalGeneration": ("bee", "BeeForConditionalGeneration"),
 "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
     "ChameleonForConditionalGeneration": (
         "chameleon",
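
Reviewer note: as a quick way to exercise this change end to end, the sketch below runs single-image inference through vLLM's `LLM.generate` API once the patch is applied. It is illustrative only and not part of the diff; the prompt string mirrors `run_bee()` above, and the image path `example.jpg` is a hypothetical placeholder.

from PIL import Image

from vllm import LLM, SamplingParams

# Engine arguments mirror run_bee() in examples/offline_inference/vision_language.py.
llm = LLM(
    model="Open-Bee/Bee-8B-RL",
    max_model_len=16384,
    limit_mm_per_prompt={"image": 1},
    trust_remote_code=True,
)

# Same chat format as run_bee(); <image> marks where image features are spliced in.
question = "What is in this image?"
prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    f"<|im_start|>user\n<image>\n{question}<|im_end|>"
    "<|im_start|>assistant\n\n"
)

# "example.jpg" is a placeholder; substitute any local test image.
image = Image.open("example.jpg")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=128),
)
print(outputs[0].outputs[0].text)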