Migrate MllamaImagePixelInputs to TensorSchema (#22020)

Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
commit 0b9cc56fac
parent 8896eb72eb
Author: Benji Beck
Date:   2025-08-21 20:28:49 -07:00 (committed by GitHub)

vllm/model_executor/models/mllama.py

@@ -17,7 +17,7 @@
 """PyTorch Mllama model."""
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal, Optional, Union

 import numpy as np
 import torch
@@ -64,6 +64,7 @@ from vllm.multimodal.processing import (BaseProcessingInfo,
                                         EncDecMultiModalProcessor,
                                         PromptReplacement, PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .clip import CLIPMLP
 from .interfaces import SupportsMultiModal, SupportsV0Only
@@ -73,15 +74,30 @@ from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
 logger = init_logger(__name__)

-class MllamaImagePixelInputs(TypedDict):
-    type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: """
-    """(batch_size, max_num_image, max_num_chunk, num_channel, height, width)"""
-    aspect_ratio_ids: torch.Tensor
-    """Shape: `(batch_size, max_num_image)`"""
-    aspect_ratio_mask: torch.Tensor
-    """Shape: `(batch_size, max_num_image, max_num_tiles)`"""
+class MllamaImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - batch_size: Batch size
+        - max_num_image: Max number of images
+        - max_num_chunk: Max number of chunks
+        - max_num_tiles: Max number of tiles per image
+        - num_channel: Number of channels
+        - height: Height
+        - width: Width
+    """
+    type: Literal["pixel_values"] = "pixel_values"
+
+    data: Annotated[torch.Tensor,
+                    TensorShape("batch_size", "max_num_image", "max_num_chunk",
+                                "num_channel", "height", "width")]
+
+    aspect_ratio_ids: Annotated[torch.Tensor,
+                                TensorShape("batch_size", "max_num_image")]
+
+    aspect_ratio_mask: Annotated[
+        torch.Tensor,
+        TensorShape("batch_size", "max_num_image", "max_num_tiles")]

 # TODO: support LlamaImageEmbeddingInputs
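
With the TypedDict replaced by a TensorSchema subclass, the declared shapes are no longer documentation only: the symbolic dimensions in each TensorShape annotation can be checked against the actual tensors. A minimal sketch of the new usage, assuming TensorSchema validates its annotated fields at construction time; the dimension sizes below are hypothetical, chosen only to satisfy the declared shapes:

import torch

from vllm.model_executor.models.mllama import MllamaImagePixelInputs

# Hypothetical sizes for illustration only.
batch_size, max_num_image, max_num_chunk = 2, 1, 4
num_channel, height, width = 3, 448, 448
max_num_tiles = 4

# Constructing the schema object is where shape checking happens:
# each field must match the rank of its TensorShape annotation, and
# shared symbolic dims (e.g. batch_size) must agree across fields.
inputs = MllamaImagePixelInputs(
    data=torch.zeros(batch_size, max_num_image, max_num_chunk,
                     num_channel, height, width),
    aspect_ratio_ids=torch.zeros(batch_size, max_num_image,
                                 dtype=torch.long),
    aspect_ratio_mask=torch.ones(batch_size, max_num_image, max_num_tiles,
                                 dtype=torch.long),
)

Under that assumption, a mismatched input, say aspect_ratio_ids built with a batch of 3 while data has a batch of 2, is rejected here rather than surfacing later as an opaque indexing error inside the vision encoder.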