From 19a00eb21099f7cd82fc4460718613df6f54fea1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 6 Oct 2025 17:45:26 +0800 Subject: [PATCH] [Model] Use `merge_by_field_config` for MM models (Llava family) (#26280) Signed-off-by: DarkLight1337 --- .../vision_language_multi_image.py | 218 +++++++++--------- vllm/model_executor/models/llava.py | 20 +- vllm/model_executor/models/llava_next.py | 24 +- .../model_executor/models/llava_next_video.py | 29 ++- vllm/model_executor/models/llava_onevision.py | 34 +-- vllm/model_executor/models/minimax_vl_01.py | 25 +- vllm/model_executor/models/mistral3.py | 11 +- vllm/model_executor/models/pixtral.py | 6 +- vllm/model_executor/models/tarsier.py | 17 +- 9 files changed, 155 insertions(+), 229 deletions(-) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index c584dce793543..3e711e2053131 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -371,6 +371,115 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "Kwai-Keye/Keye-VL-8B-Preview" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + max_num_seqs=5, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + }, + ] + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + image_data = [fetch_image(url) for url in image_urls] + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=image_data, + ) + + +def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "Kwai-Keye/Keye-VL-1_5-8B" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=32768, + max_num_seqs=5, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + }, + ] + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + image_data = [fetch_image(url) for url in image_urls] + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=image_data, + ) + + +def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "moonshotai/Kimi-VL-A3B-Instruct" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=4, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ + { + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], + } + ] + + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + return ModelRequestData( + 
engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" @@ -505,115 +614,6 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa ) -def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "Kwai-Keye/Keye-VL-8B-Preview" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=8192, - max_num_seqs=5, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [ - { - "role": "user", - "content": [ - *placeholders, - {"type": "text", "text": question}, - ], - }, - ] - - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) - - prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - image_data = [fetch_image(url) for url in image_urls] - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=image_data, - ) - - -def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "Kwai-Keye/Keye-VL-1_5-8B" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=32768, - max_num_seqs=5, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [ - { - "role": "user", - "content": [ - *placeholders, - {"type": "text", "text": question}, - ], - }, - ] - - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) - - prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - image_data = [fetch_image(url) for url in image_urls] - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=image_data, - ) - - -def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "moonshotai/Kimi-VL-A3B-Instruct" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=4, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [ - { - "role": "user", - "content": [ - *placeholders, - {"type": "text", "text": question}, - ], - } - ] - - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True) - - prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index d784be069f067..3d46e22a0d217 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -57,7 +57,6 @@ from .siglip import SiglipVisionModel from .utils import ( AutoWeightsLoader, WeightsMapper, - flatten_bn, init_vllm_registered_model, maybe_prefix, ) @@ -507,6 +506,8 @@ def init_vision_tower_for_llava( dummy_inputs=LlavaDummyInputsBuilder, ) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + merge_by_field_config = True 
+ packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], @@ -592,37 +593,26 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): return None if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of pixel values. Got type: {type(pixel_values)}" - ) - if self.config.vision_config.model_type == "pixtral": return PixtralHFImagePixelInputs( type="pixel_values_pixtral", - pixel_values=flatten_bn(pixel_values), + pixel_values=pixel_values, ) expected_h = expected_w = self.config.vision_config.image_size return LlavaImagePixelInputs( type="pixel_values", - pixel_values=flatten_bn(pixel_values, concat=True), + pixel_values=pixel_values, resolve_bindings={"h": expected_h, "w": expected_w}, ) if image_embeds is not None: - if not isinstance(image_embeds, (torch.Tensor, list)): - raise ValueError( - "Incorrect type of image embeddings. " - f"Got type: {type(image_embeds)}" - ) - if self.config.vision_config.model_type == "pixtral": raise ValueError("Pixtral-HF does not support image_embeds.") return LlavaImageEmbeddingInputs( type="image_embeds", - data=flatten_bn(image_embeds, concat=True), + data=image_embeds, ) raise AssertionError("This line should be unreachable.") diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 57fccdf942336..caedace7cab1e 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -34,7 +34,6 @@ from .siglip import SiglipVisionModel from .utils import ( AutoWeightsLoader, WeightsMapper, - flatten_bn, init_vllm_registered_model, maybe_prefix, ) @@ -222,6 +221,8 @@ class LlavaNextMultiModalProcessor( dummy_inputs=LlavaDummyInputsBuilder, ) class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + merge_by_field_config = True + hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # mapping for new names in checkpoint saved after transformers v4.52 @@ -302,21 +303,11 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP return None if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of pixel values. Got type: {type(pixel_values)}" - ) - - if not isinstance(image_sizes, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of image sizes. Got type: {type(image_sizes)}" - ) - expected_h = expected_w = self.config.vision_config.image_size return LlavaNextImagePixelInputs( type="pixel_values", - pixel_values=flatten_bn(pixel_values), - image_sizes=flatten_bn(image_sizes, concat=True), + pixel_values=pixel_values, + image_sizes=image_sizes, resolve_bindings={ "h": expected_h, "w": expected_w, @@ -324,14 +315,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP ) if image_embeds is not None: - if not isinstance(image_embeds, torch.Tensor): - raise ValueError( - f"Incorrect type of image embeds. 
Got type: {type(image_embeds)}" - ) - return LlavaNextImageEmbeddingInputs( type="image_embeds", - data=flatten_bn(image_embeds), + data=image_embeds, ) raise AssertionError("This line should be unreachable.") diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 9041b18d0f633..074acc7943a43 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -51,14 +51,13 @@ from .vision import get_vision_encoder_info class LlavaNextVideoPixelInputs(TensorSchema): """ Dimensions: - - bs: Batch size - - nv: Number of videos - - nf: Number of frames - - nc: Number of channels (3) + - bn: Batch size * number of videos + - f: Number of frames + - c: Number of channels (3) - h: Height of each frame - w: Width of each frame - Note that `num_frames` may be different for each batch, in which case + Note that `f` may be different for each batch, in which case the data is passed as a list instead of a batched tensor. Note that it only supports one video input for one batch. @@ -66,9 +65,9 @@ class LlavaNextVideoPixelInputs(TensorSchema): type: Literal["pixel_values_videos"] = "pixel_values_videos" - data: Annotated[ + pixel_values_videos: Annotated[ Union[torch.Tensor, list[torch.Tensor]], - TensorShape("bs", "nv", "nf", 3, "h", "w"), + TensorShape("bn", "f", 3, "h", "w", dynamic_dims={"f"}), ] @@ -300,6 +299,8 @@ class LlavaNextMultiModalProjector(nn.Module): dummy_inputs=LlavaNextVideoDummyInputsBuilder, ) class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + merge_by_field_config = True + hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # mapping for new names in checkpoint saved after transformers v4.52 @@ -371,7 +372,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp expected_h = expected_w = self.config.vision_config.image_size return LlavaNextVideoPixelInputs( type="pixel_values_videos", - data=pixel_values_videos, + pixel_values_videos=pixel_values_videos, resolve_bindings={ "h": expected_h, "w": expected_w, @@ -396,19 +397,15 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, Supp def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs): assert self.vision_tower is not None - video_pixels = inputs["data"] + video_pixels = inputs["pixel_values_videos"] if isinstance(video_pixels, torch.Tensor): - # TODO: support multiple videos per input - b, num_videos, num_frames, c, h, w = video_pixels.shape - assert num_videos == 1 - stacked_pixels = video_pixels.view(b * num_videos * num_frames, c, h, w) + bn, f, c, h, w = video_pixels.shape + stacked_pixels = video_pixels.view(bn * f, c, h, w) stacked_embeddings = self._video_pixels_to_features( self.vision_tower, stacked_pixels ) - embeds = stacked_embeddings.view( - b, num_frames, *stacked_embeddings.shape[1:] - ) + embeds = stacked_embeddings.view(bn, f, *stacked_embeddings.shape[1:]) elif is_list_of(video_pixels, torch.Tensor): frames_per_videos = [v.shape[0] for v in video_pixels] diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 643854ef7c16f..05f1621694c36 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -44,7 +44,6 @@ from .siglip import SiglipVisionModel from .utils import ( AutoWeightsLoader, WeightsMapper, - flatten_bn, init_vllm_registered_model, maybe_prefix, ) @@ -62,7 +61,7 @@ class 
LlavaOnevisionVideoPixelInputs(TensorSchema): - h: Height - w: Width - Note that `num_videos` may be different for each batch, and 'num_frames' + Note that `f` may be different for each batch, and 'num_frames' may be different for each video, in which case the data is passed as a list instead of a batched tensor. """ @@ -480,6 +479,8 @@ class LlavaOnevisionMultiModalProjector(nn.Module): dummy_inputs=LlavaOnevisionDummyInputsBuilder, ) class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + merge_by_field_config = True + hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # mapping for new names in checkpoint saved after transformers v4.52 @@ -539,20 +540,10 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp return None if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of pixel values. Got type: {type(pixel_values)}" - ) - - if not isinstance(image_sizes, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of image sizes. Got type: {type(image_sizes)}" - ) - return LlavaOnevisionImagePixelInputs( type="pixel_values", - pixel_values=flatten_bn(pixel_values), - image_sizes=flatten_bn(image_sizes, concat=True), + pixel_values=pixel_values, + image_sizes=image_sizes, resolve_bindings={ "h": self.config.vision_config.image_size, "w": self.config.vision_config.image_size, @@ -560,14 +551,9 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp ) if image_embeds is not None: - if not isinstance(image_embeds, torch.Tensor): - raise ValueError( - f"Incorrect type of image embeds. Got type: {type(image_embeds)}" - ) - return LlavaOnevisionImageEmbeddingInputs( type="image_embeds", - data=flatten_bn(image_embeds), + data=image_embeds, ) raise AssertionError("This line should be unreachable.") @@ -586,15 +572,9 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, Supp if pixel_values_videos is None: return None - if not isinstance(pixel_values_videos, (torch.Tensor, list)): - raise ValueError( - "Incorrect type of pixel_values_videos. " - f"Got type: {type(pixel_values_videos)}" - ) - return LlavaOnevisionVideoPixelInputs( type="pixel_values_videos", - pixel_values_videos=flatten_bn(pixel_values_videos), + pixel_values_videos=pixel_values_videos, resolve_bindings={ "h": self.config.vision_config.image_size, "w": self.config.vision_config.image_size, diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 338e3d5dca7be..a25a7097a6ece 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -32,7 +32,6 @@ from .pixtral import PixtralHFVisionModel from .siglip import SiglipVisionModel from .utils import ( AutoWeightsLoader, - flatten_bn, init_vllm_registered_model, maybe_prefix, ) @@ -180,6 +179,8 @@ class MiniMaxVL01MultiModalProcessor( dummy_inputs=MiniMaxVL01DummyInputsBuilder, ) class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + merge_by_field_config = True + packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], @@ -338,32 +339,16 @@ class MiniMaxVL01ForConditionalGeneration(nn.Module, SupportsMultiModal, Support return None if pixel_values is not None and image_sizes is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of pixel values. 
Got type: {type(pixel_values)}" - ) - - if not isinstance(image_sizes, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of image sizes. Got type: {type(image_sizes)}" - ) - return MiniMaxVL01ImagePixelInputs( type="pixel_values", - pixel_values=flatten_bn(pixel_values), - image_sizes=flatten_bn(image_sizes, concat=True), + pixel_values=pixel_values, + image_sizes=image_sizes, ) if image_embeds is not None: - if not isinstance(image_embeds, (torch.Tensor, list)): - raise ValueError( - "Incorrect type of image embeddings. " - f"Got type: {type(image_embeds)}" - ) - return MiniMaxVL01ImageEmbeddingInputs( type="image_embeds", - data=flatten_bn(image_embeds, concat=True), + data=image_embeds, ) raise AssertionError("This line should be unreachable.") diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 65fb1894b61f4..8e74425c5dbdd 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -52,7 +52,6 @@ from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel from .utils import ( AutoWeightsLoader, WeightsMapper, - flatten_bn, init_vllm_registered_model, maybe_prefix, ) @@ -424,6 +423,8 @@ def init_vision_tower_for_llava( class Mistral3ForConditionalGeneration( nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP ): + merge_by_field_config = True + packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], @@ -510,15 +511,9 @@ class Mistral3ForConditionalGeneration( if pixel_values is None and image_embeds is None: return None - assert pixel_values is not None - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of pixel values. Got type: {type(pixel_values)}" - ) - return Mistral3ImagePixelInputs( type="pixel_values_pixtral", - pixel_values=flatten_bn(pixel_values), + pixel_values=pixel_values, ) def _process_image_input( diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 52a5a63244570..65abebcf37de9 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -64,7 +64,7 @@ from vllm.transformers_utils.tokenizer import ( from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP -from .utils import flatten_bn, init_vllm_registered_model, maybe_prefix +from .utils import init_vllm_registered_model, maybe_prefix from .vision import ( VisionEncoderInfo, VisionFeatureSelectStrategy, @@ -365,6 +365,8 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]) dummy_inputs=PixtralDummyInputsBuilder, ) class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + merge_by_field_config = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): @@ -424,7 +426,7 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) return PixtralImagePixelInputs( type="pixel_values", - images=flatten_bn(images), + images=images, ) def _process_image_input( diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 73f1e79fd62de..482ad9cb77484 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -49,7 +49,6 @@ from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .siglip import SiglipVisionModel from .utils 
import ( AutoWeightsLoader, - flatten_bn, init_vllm_registered_model, maybe_prefix, ) @@ -404,6 +403,8 @@ def init_vision_tower_for_tarsier( dummy_inputs=TarsierDummyInputsBuilder, ) class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + merge_by_field_config = True + packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], @@ -467,25 +468,15 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) return None if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError( - f"Incorrect type of pixel values. Got type: {type(pixel_values)}" - ) - return TarsierImagePixelInputs( type="pixel_values", - pixel_values=flatten_bn(pixel_values, concat=True), + pixel_values=pixel_values, ) if image_embeds is not None: - if not isinstance(image_embeds, (torch.Tensor, list)): - raise ValueError( - "Incorrect type of image embeddings. " - f"Got type: {type(image_embeds)}" - ) return TarsierImageEmbeddingInputs( type="image_embeds", - data=flatten_bn(image_embeds, concat=True), + data=image_embeds, ) raise AssertionError("This line should be unreachable.")
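
Note for reviewers unfamiliar with the pattern this patch applies across the Llava-family models: setting `merge_by_field_config = True` asks vLLM's multimodal input pipeline to merge each keyword field across the batch (per its field config) before it reaches the model, so the per-model `flatten_bn` calls and manual isinstance checks are dropped and shape/type validation is left to the `TensorSchema` input classes. The snippet below is a standalone sketch of that shape transformation, not part of the patch; the toy `flatten_bn` and the example shapes are illustrative stand-ins for the real helper in vllm/model_executor/models/utils.py.

# Standalone sketch (illustrative shapes, simplified helper) of the batch
# flattening that `merge_by_field_config = True` moves out of the individual
# models and into the shared multimodal input pipeline.
import torch

def flatten_bn(x, concat=False):
    # Simplified stand-in for the removed helper: collapse the leading
    # (batch, num_items) dims of a batched tensor, or merge a list of
    # per-request tensors into one tensor along dim 0.
    if isinstance(x, torch.Tensor):
        return x.flatten(0, 1)
    return torch.cat(x, dim=0) if concat else [t for item in x for t in item]

# Before: each model received e.g. (batch=2, images_per_prompt=1, 3, 336, 336)
# and had to validate the type and flatten it itself.
batched = torch.randn(2, 1, 3, 336, 336)
print(flatten_bn(batched, concat=True).shape)  # torch.Size([2, 3, 336, 336])

# After: with merge_by_field_config the model is handed the already merged
# (2, 3, 336, 336) tensor, so pixel_values can be passed straight into the
# TensorSchema-validated input class (e.g. LlavaImagePixelInputs above).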