[V1] Refactor model executable interface for multimodal models (#10570)

Signed-off-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
Roger Wang 2024-11-26 12:46:11 -08:00 committed by GitHub
parent 7576cd38df
commit 2f0a0a17a4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 568 additions and 293 deletions

View File

@ -16,6 +16,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import IntermediateTensors, SequenceData from vllm.sequence import IntermediateTensors, SequenceData
@ -609,6 +610,25 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
return self.language_projection(query_output) return self.language_projection(query_output)
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute image embeddings from the multimodal keyword arguments.

    ``kwargs`` are forwarded to ``_parse_and_validate_image_input``. When
    that yields no image input, ``None`` is returned; otherwise the parsed
    input is passed through ``_process_image_input``.
    """
    parsed = self._parse_and_validate_image_input(**kwargs)
    return None if parsed is None else self._process_image_input(parsed)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in multimodal embeddings if given.

    The text embeddings come from the language model. When
    ``multimodal_embeddings`` is provided, it is merged into them via
    ``merge_multimodal_embeddings`` using ``BLIP2_IMAGE_TOKEN_ID``.
    """
    text_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        # Text-only input: nothing to merge.
        return text_embeds
    return merge_multimodal_embeddings(
        input_ids,
        text_embeds,
        multimodal_embeddings,
        BLIP2_IMAGE_TOKEN_ID,
    )
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -616,6 +636,7 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> Union[SamplerOutput, IntermediateTensors]: ) -> Union[SamplerOutput, IntermediateTensors]:
"""Run forward pass for BLIP-2. """Run forward pass for BLIP-2.
@ -648,32 +669,24 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
See also: See also:
:class:`Blip2ImageInputs` :class:`Blip2ImageInputs`
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None
inputs_embeds = None inputs_embeds = None
else:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is not None: # NOTE: In v1, inputs_embeds is always generated at model runner, this
vision_embeddings = self._process_image_input(image_input) # condition is for v0 compatibility.
inputs_embeds = self.language_model.model.get_input_embeddings( elif inputs_embeds is None:
input_ids) vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings)
input_ids = None
inputs_embeds = merge_multimodal_embeddings( hidden_states = self.language_model.model(input_ids,
input_ids, inputs_embeds, vision_embeddings, positions,
BLIP2_IMAGE_TOKEN_ID) kv_caches,
attn_metadata,
input_ids = None intermediate_tensors,
else: inputs_embeds=inputs_embeds)
inputs_embeds = None
hidden_states = self.language_model.model(
input_ids,
positions,
kv_caches,
attn_metadata,
intermediate_tensors=intermediate_tensors,
inputs_embeds=inputs_embeds)
return hidden_states return hidden_states

View File

@ -29,6 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import (
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs from vllm.model_executor.utils import set_weight_attrs
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges, consecutive_placeholder_ranges,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
@ -38,7 +39,7 @@ from vllm.utils import print_warning_once
from .interfaces import SupportsMultiModal, SupportsPP from .interfaces import SupportsMultiModal, SupportsPP
from .utils import (is_pp_missing_parameter, from .utils import (is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix) maybe_prefix, merge_multimodal_embeddings)
# These configs are not part of the model config but the preprocessor # These configs are not part of the model config but the preprocessor
# and processor files, so we hardcode them in the model file for now. # and processor files, so we hardcode them in the model file for now.
@ -987,6 +988,29 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
data=self._validate_pixel_values(pixel_values), data=self._validate_pixel_values(pixel_values),
) )
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute embeddings for the image tokens of the given image input.

    Returns ``None`` when ``_parse_and_validate_image_input`` finds no
    image input. Otherwise the image data is cast to
    ``self.config.torch_dtype``, converted to image tokens by the model,
    and those tokens are embedded with the model's input embeddings.
    """
    parsed = self._parse_and_validate_image_input(**kwargs)
    if parsed is None:
        return None

    assert self.model.vqmodel is not None
    pixel_data = parsed["data"].to(self.config.torch_dtype)
    image_tokens = self.model.get_image_tokens(pixel_data)
    return self.model.get_input_embeddings(image_tokens)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in multimodal embeddings if given.

    The text embeddings come from ``self.model``. When
    ``multimodal_embeddings`` is provided, it is merged via
    ``merge_multimodal_embeddings`` using the image token id from the
    model's vocabulary mapping.
    """
    text_embeds = self.model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        # Text-only input: nothing to merge.
        return text_embeds
    return merge_multimodal_embeddings(
        input_ids,
        text_embeds,
        multimodal_embeddings,
        self.model.vocabulary_mapping.image_token_id,
    )
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -994,27 +1018,27 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs, **kwargs,
) -> Union[torch.Tensor, IntermediateTensors]: ) -> Union[torch.Tensor, IntermediateTensors]:
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings)
input_ids = None input_ids = None
else:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is not None: hidden_states = self.model(input_ids,
assert self.model.vqmodel is not None positions,
image_tokens = self.model.get_image_tokens( kv_caches,
image_input["data"].to(self.config.torch_dtype)) attn_metadata,
image_token_id = self.model.vocabulary_mapping.image_token_id intermediate_tensors,
special_image_mask = input_ids == image_token_id inputs_embeds=inputs_embeds)
image_tokens = image_tokens.to(input_ids.device,
input_ids.dtype)
input_ids = input_ids.masked_scatter(special_image_mask,
image_tokens)
hidden_states = self.model(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors)
return hidden_states return hidden_states
def compute_logits( def compute_logits(

View File

@ -33,7 +33,8 @@ from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalData, MultiModalKwargs from vllm.multimodal.inputs import (MultiModalData, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
SequenceData) SequenceData)
@ -545,6 +546,30 @@ class ChatGLMModel(nn.Module):
""") """)
return GLMImagePixelInputs(pixel_values=pixel_values) return GLMImagePixelInputs(pixel_values=pixel_values)
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Run the vision module on the pixel values found in ``kwargs``.

    Returns ``None`` when the parsed image input has no ``pixel_values``.
    Otherwise the pixel values are cast to ``self.config.torch_dtype``
    and passed to ``self.vision``.
    """
    pixels = self._parse_and_validate_image_input(**kwargs)["pixel_values"]
    if pixels is None:
        return None
    cast_pixels = pixels.to(dtype=self.config.torch_dtype)
    return self.vision(cast_pixels)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in vision embeddings if given.

    The text embeddings come from ``self.embedding``. When
    ``multimodal_embeddings`` is provided, it is merged via
    ``merge_glm_vision_embeddings`` using the ``boi_token_id`` and
    ``eoi_token_id`` from the config.
    """
    text_embeds = self.embedding(input_ids)
    if multimodal_embeddings is None:
        # Text-only input: nothing to merge.
        return text_embeds
    return merge_glm_vision_embeddings(
        input_ids=input_ids,
        inputs_embeds=text_embeds,
        vision_embeddings=multimodal_embeddings,
        boi_token_id=self.config.boi_token_id,
        eoi_token_id=self.config.eoi_token_id,
    )
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -552,26 +577,17 @@ class ChatGLMModel(nn.Module):
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> torch.Tensor: ) -> torch.Tensor:
if intermediate_tensors is None:
inputs_embeds = self.embedding(input_ids)
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input["pixel_values"] is not None: # NOTE: In v1, inputs_embeds is always generated at model runner, this
pixel_values = image_input["pixel_values"].to( # condition is for v0 compatibility.
dtype=inputs_embeds.dtype) if intermediate_tensors is None and inputs_embeds is None:
image_embeds = self.vision(pixel_values) vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(input_ids,
boi_token_id = self.config.boi_token_id vision_embeddings)
eoi_token_id = self.config.eoi_token_id input_ids = None
inputs_embeds = merge_glm_vision_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
vision_embeddings=image_embeds,
boi_token_id=boi_token_id,
eoi_token_id=eoi_token_id)
else: else:
inputs_embeds = intermediate_tensors["hidden_states"] inputs_embeds = intermediate_tensors["hidden_states"]

View File

@ -35,6 +35,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges) consecutive_placeholder_ranges)
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
@ -302,6 +303,25 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) vision_embeddings, _ = self.vision_embed_tokens(image_input["data"])
return vision_embeddings return vision_embeddings
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute image embeddings from the multimodal keyword arguments.

    ``kwargs`` are forwarded to ``_parse_and_validate_image_input``. When
    that yields no image input, ``None`` is returned; otherwise the parsed
    input is passed through ``_process_image_input``.
    """
    parsed = self._parse_and_validate_image_input(**kwargs)
    return None if parsed is None else self._process_image_input(parsed)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in multimodal embeddings if given.

    The text embeddings come from the language model. When
    ``multimodal_embeddings`` is provided, it is merged via
    ``merge_multimodal_embeddings`` using ``_IMAGE_TOKEN_ID``.
    """
    text_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        # Text-only input: nothing to merge.
        return text_embeds
    return merge_multimodal_embeddings(
        input_ids,
        text_embeds,
        multimodal_embeddings,
        _IMAGE_TOKEN_ID,
    )
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -309,24 +329,19 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
): ):
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None
inputs_embeds = None inputs_embeds = None
else:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is not None: # NOTE: In v1, inputs_embeds is always generated at model runner, this
vision_embeddings = self._process_image_input(image_input) # condition is for v0 compatibility.
inputs_embeds = self.language_model.model.embed_tokens( elif inputs_embeds is None:
input_ids) vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = merge_multimodal_embeddings( inputs_embeds = self.get_input_embeddings(input_ids,
input_ids, inputs_embeds, vision_embeddings, vision_embeddings)
self.image_token_id) input_ids = None
else:
inputs_embeds = None
hidden_states = self.language_model( hidden_states = self.language_model(
input_ids=input_ids, input_ids=input_ids,

View File

@ -2,7 +2,7 @@ from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
Protocol, Type, Union, overload, runtime_checkable) Protocol, Type, Union, overload, runtime_checkable)
import torch import torch
from typing_extensions import TypeIs from typing_extensions import TypeIs, TypeVar
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import supports_kw from vllm.utils import supports_kw
@ -10,10 +10,14 @@ from vllm.utils import supports_kw
from .interfaces_base import is_embedding_model from .interfaces_base import is_embedding_model
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention import AttentionMetadata
from vllm.multimodal.inputs import NestedTensors # noqa: F401
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
logger = init_logger(__name__) logger = init_logger(__name__)
T = TypeVar("T", default="NestedTensors")
@runtime_checkable @runtime_checkable
class SupportsMultiModal(Protocol): class SupportsMultiModal(Protocol):
@ -28,6 +32,36 @@ class SupportsMultiModal(Protocol):
MRO of your model class. MRO of your model class.
""" """
def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
    """
    Returns multimodal embeddings generated from multimodal kwargs
    to be merged with text embeddings.

    Args:
        **kwargs: The multimodal keyword arguments to generate the
            embeddings from.

    Returns:
        The multimodal embeddings, or ``None`` (the return type is
        optional; presumably ``None`` means there is nothing to merge --
        confirm against the implementing models).
    """
    ...
# Only for models that support v0 chunked prefill
# TODO(ywang96): Remove this overload once v0 is deprecated
@overload
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[T] = None,
    attn_metadata: Optional["AttentionMetadata"] = None,
) -> torch.Tensor:
    # Variant of the signature that additionally accepts `attn_metadata`.
    ...

def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[T] = None,
) -> torch.Tensor:
    """
    Returns the input embeddings merged from the text embeddings from
    input_ids and the multimodal embeddings generated from multimodal
    kwargs.

    Args:
        input_ids: Token ids to compute the text embeddings from.
        multimodal_embeddings: Multimodal embeddings to merge with the
            text embeddings. Defaults to ``None``.

    Returns:
        The merged input embeddings.
    """
    ...
# We can't use runtime_checkable with ClassVar for issubclass checks # We can't use runtime_checkable with ClassVar for issubclass checks
# so we need to treat the class as an instance and use isinstance instead # so we need to treat the class as an instance and use isinstance instead

View File

@ -26,6 +26,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
InternVisionPatchModel) InternVisionPatchModel)
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of from vllm.utils import is_list_of
@ -641,6 +642,26 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
visual_token_mask = None visual_token_mask = None
return visual_token_mask return visual_token_mask
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute image embeddings from the multimodal keyword arguments.

    ``kwargs`` are forwarded to ``_parse_and_validate_image_input``. When
    that yields no image input, ``None`` is returned; otherwise the parsed
    input is passed through ``_process_image_input``.
    """
    parsed = self._parse_and_validate_image_input(**kwargs)
    return None if parsed is None else self._process_image_input(parsed)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in multimodal embeddings if given.

    The text embeddings come from the language model. When
    ``multimodal_embeddings`` is provided, ``self.img_context_token_id``
    must already be set (asserted); it is the placeholder token id passed
    to ``merge_multimodal_embeddings``.
    """
    text_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        # Text-only input: nothing to merge, and no token id is required.
        return text_embeds

    assert self.img_context_token_id is not None
    return merge_multimodal_embeddings(
        input_ids,
        text_embeds,
        multimodal_embeddings,
        self.img_context_token_id,
    )
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -648,26 +669,22 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> Union[SamplerOutput, IntermediateTensors]: ) -> Union[SamplerOutput, IntermediateTensors]:
visual_token_mask = None
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None input_ids = None
inputs_embeds = None inputs_embeds = None
visual_token_mask = None
else: # NOTE: In v1, inputs_embeds is always generated at model runner, this
image_input = self._parse_and_validate_image_input(**kwargs) # condition is for v0 compatibility.
if image_input is not None: elif inputs_embeds is None:
inputs_embeds = self.language_model.model.get_input_embeddings( vision_embeddings = self.get_multimodal_embeddings(**kwargs)
input_ids) inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings = self._process_image_input(image_input) vision_embeddings)
inputs_embeds = merge_multimodal_embeddings( input_ids = None
input_ids, inputs_embeds, vision_embeddings,
self.img_context_token_id)
visual_token_mask = self._get_visual_token_mask(input_ids)
input_ids = None
else:
inputs_embeds = None
visual_token_mask = None
forward_kwargs = { forward_kwargs = {
"input_ids": input_ids, "input_ids": input_ids,
@ -677,6 +694,13 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
"intermediate_tensors": intermediate_tensors, "intermediate_tensors": intermediate_tensors,
"inputs_embeds": inputs_embeds, "inputs_embeds": inputs_embeds,
} }
if self.img_context_token_id is not None:
visual_token_mask = self._get_visual_token_mask(input_ids)
# We always overwrite it back to None after computing visual token
# mask so that this doesn't need to depend on encoder output
self.img_context_token_id = None
if self.is_mono: if self.is_mono:
forward_kwargs.update({"visual_token_mask": visual_token_mask}) forward_kwargs.update({"visual_token_mask": visual_token_mask})

View File

@ -478,7 +478,7 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
image_features = self._process_image_pixels(image_input) image_features = self._process_image_pixels(image_input)
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def process_mm_inputs(self, **kwargs): def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return None
@ -488,12 +488,12 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
def get_input_embeddings( def get_input_embeddings(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
vision_embeddings: Optional[NestedTensors] = None, multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor: ) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids) inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if vision_embeddings is not None: if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings( inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, vision_embeddings, input_ids, inputs_embeds, multimodal_embeddings,
self.config.image_token_index) self.config.image_token_index)
return inputs_embeds return inputs_embeds
@ -544,10 +544,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None: elif inputs_embeds is None:
vision_embeddings = self.process_mm_inputs(**kwargs) vision_embeddings = self.get_multimodal_embeddings(**kwargs)
# always pass the input via `inputs_embeds`
# to make sure the computation graph is consistent
inputs_embeds = self.get_input_embeddings(input_ids, inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings) vision_embeddings)
input_ids = None input_ids = None

View File

@ -19,6 +19,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors
from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.sequence import IntermediateTensors, PoolerOutput
from vllm.utils import is_list_of from vllm.utils import is_list_of
@ -565,6 +566,30 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
for i, patch_features_batch in enumerate(patch_embeddings) for i, patch_features_batch in enumerate(patch_embeddings)
] ]
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute image embeddings from the multimodal keyword arguments.

    ``kwargs`` are forwarded to ``_parse_and_validate_image_input``. When
    that yields no image input, ``None`` is returned; otherwise the parsed
    input is passed through ``_process_image_input``.
    """
    parsed = self._parse_and_validate_image_input(**kwargs)
    return None if parsed is None else self._process_image_input(parsed)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids``, combining with multimodal embeddings if given.

    When ``multimodal_embeddings`` is provided, embedding is delegated to
    ``embed_multimodal`` with the config's ``image_token_index`` and the
    inner language model's text-embedding function. Otherwise the language
    model's plain input embeddings are returned.
    """
    if multimodal_embeddings is not None:
        return embed_multimodal(
            input_ids,
            self.config.image_token_index,
            self.language_model.model.get_input_embeddings,
            multimodal_embeddings,
        )
    # Text-only input.
    return self.language_model.get_input_embeddings(input_ids)
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -572,6 +597,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]: ) -> Union[torch.Tensor, IntermediateTensors]:
"""Run forward pass for LlaVA-NeXT. """Run forward pass for LlaVA-NeXT.
@ -620,24 +646,14 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
else:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is not None: # NOTE: In v1, inputs_embeds is always generated at model runner, this
inputs_embeds = embed_multimodal( # condition is for v0 compatibility.
input_ids, elif inputs_embeds is None:
self.config.image_token_index, vision_embeddings = self.get_multimodal_embeddings(**kwargs)
self.language_model.model.get_input_embeddings, inputs_embeds = self.get_input_embeddings(input_ids,
lambda _: self._process_image_input(image_input), vision_embeddings)
) input_ids = None
else:
inputs_embeds = self.language_model.model.get_input_embeddings(
input_ids)
# always pass the input via `inputs_embeds`
# to make sure the computation graph is consistent
# for `torch.compile` integration
input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,
@ -645,7 +661,6 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
attn_metadata, attn_metadata,
intermediate_tensors, intermediate_tensors,
inputs_embeds=inputs_embeds) inputs_embeds=inputs_embeds)
return hidden_states return hidden_states
def compute_logits( def compute_logits(

View File

@ -18,6 +18,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.clip import CLIPVisionModel
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
@ -388,6 +389,25 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
raise ValueError( raise ValueError(
f"Unsupported type of video input {type(video_pixels)}") f"Unsupported type of video input {type(video_pixels)}")
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Compute video embeddings from the multimodal keyword arguments.

    ``kwargs`` are forwarded to ``_parse_and_validate_video_input``. When
    that yields no video input, ``None`` is returned; otherwise the parsed
    input is passed through ``_process_video_pixels``.
    """
    parsed = self._parse_and_validate_video_input(**kwargs)
    return None if parsed is None else self._process_video_pixels(parsed)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in multimodal embeddings if given.

    The text embeddings come from the language model. When
    ``multimodal_embeddings`` is provided, it is merged via
    ``merge_multimodal_embeddings`` using the config's
    ``video_token_index``.
    """
    text_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        # Text-only input: nothing to merge.
        return text_embeds
    return merge_multimodal_embeddings(
        input_ids,
        text_embeds,
        multimodal_embeddings,
        self.config.video_token_index,
    )
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -395,6 +415,7 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]: ) -> Union[torch.Tensor, IntermediateTensors]:
"""Run forward pass for LlaVA-NeXT-Video. """Run forward pass for LlaVA-NeXT-Video.
@ -404,22 +425,15 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
pixel_values_videos: Pixels in each frames for each input videos. pixel_values_videos: Pixels in each frames for each input videos.
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None
inputs_embeds = None inputs_embeds = None
else:
video_input = self._parse_and_validate_video_input(**kwargs)
if video_input is not None:
video_embeddings = self._process_video_pixels(video_input)
inputs_embeds = self.language_model \
.model.get_input_embeddings(input_ids)
inputs_embeds = merge_multimodal_embeddings( # NOTE: In v1, inputs_embeds is always generated at model runner, this
input_ids, inputs_embeds, video_embeddings, # condition is for v0 compatibility.
self.config.video_token_index) elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
input_ids = None inputs_embeds = self.get_input_embeddings(input_ids,
else: vision_embeddings)
inputs_embeds = None input_ids = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,

View File

@ -21,6 +21,7 @@ from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens) repeat_and_pad_placeholder_tokens)
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
@ -824,6 +825,49 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
image_feature = image_feature.view(batch_frames, -1, dim) image_feature = image_feature.view(batch_frames, -1, dim)
return image_feature return image_feature
def get_multimodal_embeddings(
        self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
    """Compute embeddings for each modality present in ``kwargs``.

    Returns ``None`` when ``_parse_and_validate_multimodal_inputs`` yields
    nothing. Otherwise returns a list of ``(embeddings, modality)`` pairs,
    images (tagged ``"image"``) before videos (tagged ``"video"``).
    """
    parsed = self._parse_and_validate_multimodal_inputs(**kwargs)
    if not parsed:
        return None

    # Each embedding is paired with its modality string. This is a temporary
    # workaround so mixed modalities can still be told apart when
    # get_multimodal_embeddings and get_input_embeddings are called
    # separately.
    # TODO(ywang96): Add support for mixed-modality inference for v1.
    encoders = (
        ("images", "image", self._process_image_input),
        ("videos", "video", self._process_video_pixels),
    )
    tagged: List[Tuple[NestedTensors, str]] = []
    for key, tag, encode in encoders:
        if key in parsed:
            tagged.append((encode(parsed[key]), tag))
    return tagged
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[List[Tuple[NestedTensors,
                                               str]]] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in any modality-tagged embeddings.

    Args:
        input_ids: Token ids handed to the language model's embedding
            lookup.
        multimodal_embeddings: Optional list of ``(embeddings, modality)``
            tuples. Entries tagged ``"image"`` or ``"video"`` are merged
            at the corresponding placeholder token index from
            ``self.config``; any other tag is ignored.

    Returns:
        The text embeddings, with multimodal embeddings merged in when
        provided.
    """
    merged = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return merged

    for mm_embeds, kind in multimodal_embeddings:
        # Resolve the placeholder token lazily so that only the config
        # attribute for the modality actually present is read.
        if kind == "image":
            placeholder_id = self.config.image_token_index
        elif kind == "video":
            placeholder_id = self.config.video_token_index
        else:
            continue
        merged = merge_multimodal_embeddings(input_ids, merged, mm_embeds,
                                             placeholder_id)
    return merged
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -831,6 +875,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]: ) -> Union[torch.Tensor, IntermediateTensors]:
"""Run forward pass for LlaVA-Onevision. """Run forward pass for LlaVA-Onevision.
@ -840,28 +885,15 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
pixel_values_videos: Pixels in each frames for each input videos. pixel_values_videos: Pixels in each frames for each input videos.
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None
inputs_embeds = None inputs_embeds = None
else:
modalities = self._parse_and_validate_multimodal_inputs(**kwargs) # NOTE: In v1, inputs_embeds is always generated at model runner, this
if modalities: # condition is for v0 compatibility.
inputs_embeds = self.language_model.model.get_input_embeddings( elif inputs_embeds is None:
input_ids) multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
if "images" in modalities: inputs_embeds = self.get_input_embeddings(input_ids,
image_input = modalities["images"] multimodal_embeddings)
vision_embeddings = self._process_image_input(image_input) input_ids = None
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, vision_embeddings,
self.config.image_token_index)
if "videos" in modalities:
video_input = modalities["videos"]
video_embeddings = self._process_video_pixels(video_input)
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, video_embeddings,
self.config.video_token_index)
input_ids = None
else:
inputs_embeds = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,

View File

@ -3,7 +3,7 @@ import re
from array import array from array import array
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache, partial from functools import lru_cache, partial
from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict, Union from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict
import torch import torch
from einops import rearrange from einops import rearrange
@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding) ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.platforms import _Backend from vllm.platforms import _Backend
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
@ -756,6 +757,12 @@ class MolmoModel(nn.Module):
make_empty_intermediate_tensors_factory( make_empty_intermediate_tensors_factory(
["hidden_states", "residual"], config.hidden_size)) ["hidden_states", "residual"], config.hidden_size))
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
) -> torch.Tensor:
    """Return the result of ``self.embed_tokens`` applied to ``input_ids``."""
    token_embeds = self.embed_tokens(input_ids)
    return token_embeds
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -1098,19 +1105,16 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
return image_features return image_features
def _merge_multimodal_embeddings( def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
self, image_input = self._parse_and_validate_image_input(**kwargs)
inputs_embeds: torch.Tensor, if image_input is None:
image_features: torch.Tensor, return None
image_input_idx: torch.Tensor, image_features = self._process_image_input(image_input)
seq_len: Union[torch.Tensor, List[torch.Tensor]], image_input_idx = image_input["image_input_idx"]
) -> torch.Tensor: seq_len = image_input["seq_len"]
batch_size, num_image, num_patch = image_features.shape[:3] batch_size, num_image, num_patch = image_features.shape[:3]
assert image_input_idx.shape == (batch_size, num_image, num_patch) assert image_input_idx.shape == (batch_size, num_image, num_patch)
image_features = image_features.to(inputs_embeds.device)
seq_len = seq_len.to(inputs_embeds.device)
# insert the image feature into the embedding. # insert the image feature into the embedding.
image_features = image_features.view(batch_size, num_image * num_patch, image_features = image_features.view(batch_size, num_image * num_patch,
-1) -1)
@ -1130,12 +1134,24 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
image_input_idx = image_input_idx + offset.to(image_input_idx.dtype) image_input_idx = image_input_idx + offset.to(image_input_idx.dtype)
image_input_idx = image_input_idx.flatten()[:, None] image_input_idx = image_input_idx.flatten()[:, None]
mat = image_input_idx == torch.arange( mat = image_input_idx == torch.arange(
seq_len.sum().item(), device=inputs_embeds.device)[None, :] seq_len.sum().item(), device=image_features.device)[None, :]
mat = mat.to(image_features.dtype) mat = mat.to(image_features.dtype)
inputs_embeds = inputs_embeds + torch.einsum('nd,nm->md', # Note: In this original implementation from AI2, the final
image_features, mat) # vision_embeddings will always be the same length
# as the input embeddings, which is not very efficient.
# TODO(ywang96): see if this can be optimized.
vision_embeddings = torch.einsum('nd,nm->md', image_features, mat)
return vision_embeddings
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and add the multimodal embeddings, if any.

    Unlike a placeholder-based merge, the multimodal embeddings here are
    combined with the text embeddings by plain addition.
    """
    text_embeds = self.model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return text_embeds
    return text_embeds + multimodal_embeddings
def forward( def forward(
@ -1145,39 +1161,27 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> SamplerOutput: ) -> SamplerOutput:
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
else:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is not None: # NOTE: In v1, inputs_embeds is always generated at model runner, this
inputs_embeds = self.model.embed_tokens(input_ids) # condition is for v0 compatibility.
image_features = self._process_image_input(image_input) elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings)
input_ids = None
inputs_embeds = self._merge_multimodal_embeddings( hidden_states = self.model(input_ids,
inputs_embeds, positions,
image_features, kv_caches,
image_input["image_input_idx"], attn_metadata,
image_input["seq_len"], intermediate_tensors,
) inputs_embeds=inputs_embeds)
else:
inputs_embeds = self.model.embed_tokens(input_ids)
# always pass the input via `inputs_embeds`
# to make sure the computation graph is consistent
# for `torch.compile` integration
input_ids = None
hidden_states = self.model(
input_ids=input_ids,
positions=positions,
kv_caches=kv_caches,
attn_metadata=attn_metadata,
intermediate_tensors=intermediate_tensors,
inputs_embeds=inputs_embeds,
)
return hidden_states return hidden_states

View File

@ -13,6 +13,7 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
@ -240,36 +241,45 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Encode the image input found in ``kwargs``, if there is one.

    Returns ``None`` when no image input is parsed; otherwise the
    processed image embeddings scaled by ``hidden_size ** -0.5``.
    """
    parsed_image = self._parse_and_validate_image_input(**kwargs)
    if parsed_image is None:
        return None

    # Scaling follows the Hugging Face reference implementation:
    # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa
    scale = self.config.hidden_size**-0.5
    return self._process_image_input(parsed_image) * scale
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge image embeddings when supplied.

    When ``multimodal_embeddings`` is given, it is merged into the text
    embeddings using ``self.config.image_token_index`` as the placeholder
    token id.
    """
    text_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return text_embeds
    return merge_multimodal_embeddings(input_ids, text_embeds,
                                       multimodal_embeddings,
                                       self.config.image_token_index)
def forward(self, def forward(self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
positions: torch.Tensor, positions: torch.Tensor,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object) -> Union[SamplerOutput, IntermediateTensors]: **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]:
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None
inputs_embeds = None inputs_embeds = None
else:
parsed_image_input = self._parse_and_validate_image_input(**kwargs)
if parsed_image_input is not None: # NOTE: In v1, inputs_embeds is always generated at model runner, this
vision_embeddings = self._process_image_input( # condition is for v0 compatibility.
parsed_image_input) elif inputs_embeds is None:
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa vision_embeddings = self.get_multimodal_embeddings(**kwargs)
vision_embeddings = vision_embeddings * ( inputs_embeds = self.get_input_embeddings(input_ids,
self.config.hidden_size**-0.5) vision_embeddings)
input_ids = None
inputs_embeds = self.language_model.model.get_input_embeddings(
input_ids)
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, vision_embeddings,
self.config.image_token_index)
input_ids = None
else:
inputs_embeds = None
hidden_states = self.language_model.model(input_ids, hidden_states = self.language_model.model(input_ids,
positions, positions,

View File

@ -676,7 +676,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
return image_embeds return image_embeds
def process_mm_inputs(self, **kwargs): def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return None
@ -686,12 +686,12 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge image embeddings when supplied.

    The text embeddings come from ``self.embed_tokens``. When
    ``multimodal_embeddings`` is given, it is merged in using
    ``self.image_token_id`` as the placeholder token id.
    """
    text_embeds = self.embed_tokens(input_ids)
    if multimodal_embeddings is None:
        return text_embeds
    return merge_multimodal_embeddings(input_ids, text_embeds,
                                       multimodal_embeddings,
                                       self.image_token_id)
@ -703,12 +703,14 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object): **kwargs: object):
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility
elif inputs_embeds is None: elif inputs_embeds is None:
vision_embeddings = self.process_mm_inputs(**kwargs) vision_embeddings = self.get_multimodal_embeddings(**kwargs)
# always pass the input via `inputs_embeds`
# to make sure the computation graph is consistent
inputs_embeds = self.get_input_embeddings(input_ids, inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings) vision_embeddings)
input_ids = None input_ids = None

View File

@ -42,10 +42,12 @@ from vllm.model_executor.model_loader.weight_utils import (
from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import IntermediateTensors, SequenceData from vllm.sequence import IntermediateTensors, SequenceData
from .interfaces import SupportsMultiModal, SupportsPP from .interfaces import SupportsMultiModal, SupportsPP
from .utils import merge_multimodal_embeddings
logger = init_logger(__name__) logger = init_logger(__name__)
@ -371,6 +373,25 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
return masked_audio_features return masked_audio_features
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Encode the audio input found in ``kwargs``, if there is one.

    Returns ``None`` when no audio input is parsed; otherwise whatever
    ``self._process_audio_input`` produces for the parsed input.
    """
    parsed_audio = self._parse_and_validate_audio_input(**kwargs)
    if parsed_audio is None:
        return None
    return self._process_audio_input(parsed_audio)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge audio embeddings when supplied.

    When ``multimodal_embeddings`` is given, it is merged into the text
    embeddings using ``self.config.audio_token_index`` as the placeholder
    token id.
    """
    text_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return text_embeds
    return merge_multimodal_embeddings(input_ids, text_embeds,
                                       multimodal_embeddings,
                                       self.config.audio_token_index)
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -378,33 +399,27 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]: ) -> Union[torch.Tensor, IntermediateTensors]:
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None
inputs_embeds = None inputs_embeds = None
else:
audio_input = self._parse_and_validate_audio_input(**kwargs)
if audio_input is None: # NOTE: In v1, inputs_embeds is always generated at model runner, this
inputs_embeds = None # condition is for v0 compatibility.
else: elif inputs_embeds is None:
inputs_embeds = self.language_model.embed_tokens(input_ids) multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
masked_audio_features = self._process_audio_input(audio_input) inputs_embeds = self.get_input_embeddings(input_ids,
# merge llm embeddings and audio features multimodal_embeddings)
mask = (input_ids == self.config.audio_token_index) input_ids = None
inputs_embeds[mask, :] = masked_audio_features
input_ids = None hidden_states = self.language_model(input_ids,
positions,
hidden_states = self.language_model( kv_caches,
input_ids=input_ids, attn_metadata,
positions=positions, intermediate_tensors,
kv_caches=kv_caches, inputs_embeds=inputs_embeds)
attn_metadata=attn_metadata,
intermediate_tensors=intermediate_tensors,
inputs_embeds=inputs_embeds,
)
return hidden_states return hidden_states
def compute_logits(self, hidden_states: torch.Tensor, def compute_logits(self, hidden_states: torch.Tensor,

View File

@ -63,7 +63,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict,
MultiModalKwargs) MultiModalKwargs, NestedTensors)
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.platforms import _Backend from vllm.platforms import _Backend
from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData
@ -1238,6 +1238,55 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
inputs_embeds[mask, :] = multimodal_embeddings inputs_embeds[mask, :] = multimodal_embeddings
return inputs_embeds return inputs_embeds
def get_multimodal_embeddings(
        self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
    """Encode the image and/or video inputs found in ``kwargs``.

    Returns ``None`` when neither an image nor a video input is parsed.
    Otherwise returns a list of ``(embeddings, modality)`` tuples, with
    the ``"image"`` entry (if any) before the ``"video"`` entry (if any).
    """
    image_input = self._parse_and_validate_image_input(**kwargs)
    video_input = self._parse_and_validate_video_input(**kwargs)

    # Each embedding is paired with its modality string. This is a
    # temporary workaround so that mixed modalities can still be told
    # apart when get_multimodal_embeddings and get_input_embeddings are
    # called separately.
    # TODO(ywang96): Add support for mixed-modality inference for v1.
    sources = (
        (image_input, self._process_image_input, "image"),
        (video_input, self._process_video_input, "video"),
    )
    tagged: List[Tuple[NestedTensors, str]] = [
        (encode(parsed), kind) for parsed, encode, kind in sources
        if parsed is not None
    ]
    # The list is empty exactly when both inputs were None.
    return tagged or None
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[List[Tuple[NestedTensors,
                                               str]]] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge in any modality-tagged embeddings.

    Args:
        input_ids: Token ids handed to ``self.model.get_input_embeddings``.
        multimodal_embeddings: Optional list of ``(embeddings, modality)``
            tuples. Entries tagged ``"image"`` or ``"video"`` are merged
            via ``self._merge_multimodal_embeddings`` using the matching
            placeholder token id from ``self.config``; any other tag is
            ignored.

    Returns:
        The text embeddings, with multimodal embeddings merged in when
        provided.
    """
    merged = self.model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return merged

    for mm_embeds, kind in multimodal_embeddings:
        # Resolve the placeholder token lazily so that only the config
        # attribute for the modality actually present is read.
        if kind == "image":
            token_id = self.config.image_token_id
        elif kind == "video":
            token_id = self.config.video_token_id
        else:
            continue
        merged = self._merge_multimodal_embeddings(
            input_ids,
            merged,
            mm_embeds,
            placeholder_token_id=token_id,
        )
    return merged
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
@ -1245,6 +1294,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]: ) -> Union[torch.Tensor, IntermediateTensors]:
"""Run forward pass for Qwen2-VL. """Run forward pass for Qwen2-VL.
@ -1266,42 +1316,26 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
`None` if no videos are passed. `None` if no videos are passed.
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None
inputs_embeds = None inputs_embeds = None
else:
image_input = self._parse_and_validate_image_input(**kwargs)
video_input = self._parse_and_validate_video_input(**kwargs)
if image_input is None and video_input is None: # NOTE: In v1, inputs_embeds is always generated at model runner, this
inputs_embeds = None # condition is for v0 compatibility.
else: elif inputs_embeds is None:
if uses_mrope(self.config): multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
assert positions.ndim == 2 and positions.size(0) == 3, (
"multimodal section rotary embedding requires "
f"(3, seq_len) positions, but got {positions.size()}")
inputs_embeds = self.model.embed_tokens(input_ids) # We need to check for usage of mrope here in case there is
# multimodal data.
# TODO (ywang96): move this to model runner in V1.
if multimodal_embeddings is not None and uses_mrope(self.config):
assert positions.ndim == 2 and positions.size(0) == 3, (
"multimodal section rotary embedding requires "
f"(3, seq_len) positions, but got {positions.size()}")
if image_input is not None: inputs_embeds = self.get_input_embeddings(input_ids,
image_embeds = self._process_image_input(image_input) multimodal_embeddings)
inputs_embeds = self._merge_multimodal_embeddings( input_ids = None
input_ids,
inputs_embeds,
image_embeds,
placeholder_token_id=self.config.image_token_id,
)
if video_input is not None:
video_embeds = self._process_video_input(video_input)
inputs_embeds = self._merge_multimodal_embeddings(
input_ids,
inputs_embeds,
video_embeds,
placeholder_token_id=self.config.video_token_id,
)
input_ids = None
hidden_states = self.model( hidden_states = self.model(
input_ids=input_ids, input_ids=input_ids,

View File

@ -449,10 +449,36 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
return result return result
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
    """Encode the audio input found in ``kwargs``, if there is one.

    Returns ``None`` when no audio input is parsed; otherwise whatever
    ``self._process_audio_input`` produces for the parsed input.
    """
    parsed_audio = self._parse_and_validate_audio_input(**kwargs)
    if parsed_audio is None:
        return None
    return self._process_audio_input(parsed_audio)
def get_input_embeddings(
    self,
    input_ids: torch.Tensor,
    multimodal_embeddings: Optional[NestedTensors] = None,
    attn_metadata: Optional[AttentionMetadata] = None,
) -> torch.Tensor:
    """Embed ``input_ids`` and merge audio embeddings when supplied.

    Args:
        input_ids: Token ids handed to the language model's embedding
            lookup.
        multimodal_embeddings: Optional audio embeddings to merge into
            the text embeddings.
        attn_metadata: Attention metadata whose
            ``multi_modal_placeholder_index_maps["audio"]`` entry locates
            the audio placeholders. Required whenever
            ``multimodal_embeddings`` is given.

    Returns:
        The text embeddings, with audio embeddings merged in when
        provided.

    Raises:
        ValueError: If ``multimodal_embeddings`` is given without
            ``attn_metadata``.
    """
    if multimodal_embeddings is not None and attn_metadata is None:
        # attn_metadata defaults to None in the signature; fail with a
        # clear message rather than an AttributeError on None below.
        raise ValueError(
            "attn_metadata is required to merge multimodal embeddings: "
            "the audio placeholder index map is read from it.")

    inputs_embeds = self.language_model.get_input_embeddings(input_ids)
    if multimodal_embeddings is None:
        return inputs_embeds

    # TODO(ywang96): use merge_multimodal_embeddings after
    # v0 is deprecated
    # The helper's return value is not used; inputs_embeds is presumably
    # updated in place -- confirm against the helper.
    merge_multimodal_embeddings_from_map(
        inputs_embeds, multimodal_embeddings,
        attn_metadata.multi_modal_placeholder_index_maps["audio"])
    return inputs_embeds
def forward(self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[torch.Tensor], intermediate_tensors: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs) -> Union[torch.Tensor, IntermediateTensors]: **kwargs) -> Union[torch.Tensor, IntermediateTensors]:
"""Run forward pass for Ultravox """Run forward pass for Ultravox
@ -466,30 +492,28 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP):
Args: Args:
audio_features: A batch of audio inputs [B, N, 80, M]. audio_features: A batch of audio inputs [B, N, 80, M].
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
input_ids = None
inputs_embeds = None inputs_embeds = None
else:
audio_input = self._parse_and_validate_audio_input(**kwargs)
if audio_input is not None:
audio_embeddings = self._process_audio_input(audio_input)
inputs_embeds = self.language_model.model.get_input_embeddings(
input_ids)
merge_multimodal_embeddings_from_map( # NOTE: In v1, inputs_embeds is always generated at model runner, this
inputs_embeds, audio_embeddings, # condition is for v0 compatibility.
attn_metadata.multi_modal_placeholder_index_maps["audio"]) elif inputs_embeds is None:
input_ids = None multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
else:
inputs_embeds = None
hidden_states = self.language_model.model( # TODO(ywang96): remove attn_metadata from get_input_embeddings
input_ids=input_ids, # after v0 is deprecated
positions=positions, inputs_embeds = self.get_input_embeddings(input_ids,
kv_caches=kv_caches, multimodal_embeddings,
attn_metadata=attn_metadata, attn_metadata)
intermediate_tensors=intermediate_tensors, input_ids = None
inputs_embeds=inputs_embeds)
hidden_states = self.language_model.model(input_ids,
positions,
kv_caches,
attn_metadata,
intermediate_tensors,
inputs_embeds=inputs_embeds)
return hidden_states return hidden_states
def compute_logits(self, hidden_states: torch.Tensor, def compute_logits(self, hidden_states: torch.Tensor,

View File

@ -356,8 +356,7 @@ def embed_multimodal(
input_ids: torch.Tensor, input_ids: torch.Tensor,
multimodal_token_id: int, multimodal_token_id: int,
get_text_embeds: Callable[[torch.Tensor], torch.Tensor], get_text_embeds: Callable[[torch.Tensor], torch.Tensor],
get_multimodal_embeds: Callable[[torch.Tensor], Union[torch.Tensor, multimodal_embeds: Union[torch.Tensor, List[torch.Tensor]],
List[torch.Tensor]]],
) -> torch.Tensor: ) -> torch.Tensor:
""" """
Embed token IDs and multimodal inputs and combine their embeddings. Embed token IDs and multimodal inputs and combine their embeddings.
@ -374,8 +373,6 @@ def embed_multimodal(
is_text = ~is_multimodal is_text = ~is_multimodal
text_embeds = get_text_embeds(input_ids[is_text]) text_embeds = get_text_embeds(input_ids[is_text])
multimodal_embeds = get_multimodal_embeds(input_ids[is_multimodal])
merged_embeds = torch.empty( merged_embeds = torch.empty(
(input_ids.shape[0], text_embeds.shape[1]), (input_ids.shape[0], text_embeds.shape[1]),
dtype=text_embeds.dtype, dtype=text_embeds.dtype,

View File

@ -363,7 +363,8 @@ class GPUModelRunner:
# 2. A list (length: num_images) of tensors, each of shape # 2. A list (length: num_images) of tensors, each of shape
# [feature_size, hidden_size] in case when the feature size is # [feature_size, hidden_size] in case when the feature size is
# dynamic depending on input images. # dynamic depending on input images.
encoder_outputs = self.model.process_mm_inputs(**batched_mm_inputs) encoder_outputs = self.model.get_multimodal_embeddings(
**batched_mm_inputs)
# Cache the encoder outputs. # Cache the encoder outputs.
for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): for (req_id, input_id), output in zip(req_input_ids, encoder_outputs):