mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-24 11:16:34 +08:00
Fix copy-paste error in phi4mm image processing (#18315)
Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
This commit is contained in:
parent
908733aca7
commit
4fb349f66a
@ -415,15 +415,6 @@ class Phi4MMImagePixelInputs(TypedDict):
|
|||||||
"""Shape: `(batch_size * num_images, H_mask, W_mask)`"""
|
"""Shape: `(batch_size * num_images, H_mask, W_mask)`"""
|
||||||
|
|
||||||
|
|
||||||
class Phi4MMImageEmbeddingInputs(TypedDict):
|
|
||||||
type: Literal["image_embeds"]
|
|
||||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
|
||||||
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
|
|
||||||
|
|
||||||
`hidden_size` must match the hidden size of language model backbone.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class Phi4MMAudioFeatureInputs(TypedDict):
|
class Phi4MMAudioFeatureInputs(TypedDict):
|
||||||
type: Literal["audio_features"]
|
type: Literal["audio_features"]
|
||||||
data: Union[torch.Tensor, list[torch.Tensor]]
|
data: Union[torch.Tensor, list[torch.Tensor]]
|
||||||
@ -436,7 +427,6 @@ class Phi4MMAudioEmbeddingInputs(TypedDict):
|
|||||||
"""Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)"""
|
"""Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)"""
|
||||||
|
|
||||||
|
|
||||||
Phi4MMImageInput = Union[Phi4MMImagePixelInputs, Phi4MMImageEmbeddingInputs]
|
|
||||||
Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
|
Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
|
||||||
|
|
||||||
|
|
||||||
@ -1112,15 +1102,13 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
|
|||||||
|
|
||||||
def _process_image_input(
|
def _process_image_input(
|
||||||
self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]:
|
self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]:
|
||||||
if image_input["type"] == "image_embeds":
|
|
||||||
image_embeds = image_input["image_embeds"].type(self.visual.dtype)
|
dtype = next(self.vision_encoder.parameters()).dtype
|
||||||
else:
|
pixel_values = image_input['data'].to(dtype)
|
||||||
dtype = next(self.vision_encoder.parameters()).dtype
|
image_sizes = image_input['image_sizes']
|
||||||
pixel_values = image_input['data'].to(dtype)
|
image_attention_mask = image_input['image_attention_mask']
|
||||||
image_sizes = image_input['image_sizes']
|
image_embeds = self.vision_encoder(pixel_values, image_sizes,
|
||||||
image_attention_mask = image_input['image_attention_mask']
|
image_attention_mask)
|
||||||
image_embeds = self.vision_encoder(pixel_values, image_sizes,
|
|
||||||
image_attention_mask)
|
|
||||||
return image_embeds
|
return image_embeds
|
||||||
|
|
||||||
def get_multimodal_embeddings(
|
def get_multimodal_embeddings(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user