Migrate Florence2ImagePixelInputs to TensorSchema (#21663)

Signed-off-by: Benji Beck <benjibeck@meta.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2026-01-27 10:25:15 +08:00 · 2025-07-27 02:43:02 -07:00 · 2025-07-27 02:43:02 -07:00 · 5f8c9a425e
commit 5f8c9a425e
parent 1cbf951ba2
1 changed files with 24 additions and 28 deletions
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@ -4,7 +4,7 @@
 import math
 from collections import OrderedDict
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, Union
+from typing import Annotated, Literal, Optional, Union

 import torch
 import torch.nn as nn
@ -29,16 +29,28 @@ from vllm.multimodal.processing import (BaseProcessingInfo,
                                        PromptUpdate)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
                         SupportsV0Only)
 from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings


-class Florence2ImagePixelInputs(TypedDict):
+class Florence2ImagePixelInputs(TensorSchema):
+    """
+    Dimensions:
+        - b: Batch size
+        - c: Number of channels (3)
+        - h: Height of the image
+        - w: Width of the image
+    """
+
    type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: (batch_size, num_channel, height, width)"""
+
+    data: Annotated[
+        torch.Tensor,
+        TensorShape("b", 3, "h", "w"),
+    ]


 # ViT implementation are all copied from
@ -931,28 +943,6 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
            raise NotImplementedError(
                'Florence2 only supports COSINE as temporal embedding.')

-    def _validate_pixel_values(
-        self, data: Union[torch.Tensor, list[torch.Tensor]]
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
-
-        size = self.processor_config["size"]
-        h, w = size["height"], size["width"]
-        expected_dims = (3, h, w)
-
-        def _validate_shape(d: torch.Tensor):
-            actual_dims = tuple(d.shape)
-
-            if actual_dims != expected_dims:
-                expected_expr = tuple(*map(str, expected_dims))
-                raise ValueError(
-                    "The expected shape of pixel values per batch "
-                    f"is {expected_expr}. You supplied {tuple(d.shape)}.")
-
-        for d in data:
-            _validate_shape(d)
-
-        return data
-
    def _parse_and_validate_image_input(self, **kwargs: object):
        pixel_values: Optional[Union[list[list[torch.Tensor]],
                                     list[torch.Tensor],
@ -971,10 +961,16 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
                "Both pixel values and image embeds are provided.")

        if pixel_values is not None:
+            size = self.processor_config["size"]
+            expected_h, expected_w = size["height"], size["width"]
+
            return Florence2ImagePixelInputs(
                type="pixel_values",
-                data=self._validate_pixel_values(
-                    flatten_bn(pixel_values, concat=True)),
+                data=flatten_bn(pixel_values, concat=True),
+                resolve_bindings={
+                    "h": expected_h,
+                    "w": expected_w
+                },
            )

        if image_embeds is not None: