mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-05 13:15:42 +08:00
Migrate KeyeImageInputs and KeyeVideoInputs to TensorSchema (#21686)
Signed-off-by: Benji Beck <benjibeck@meta.com>
This commit is contained in:
parent
a6c050286a
commit
d128d0d554
@ -3,7 +3,7 @@
|
|||||||
import math
|
import math
|
||||||
from collections.abc import Iterable, Mapping, Sequence
|
from collections.abc import Iterable, Mapping, Sequence
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import Any, Literal, Optional, TypedDict, Union
|
from typing import Annotated, Any, Literal, Optional, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@ -46,6 +46,7 @@ from vllm.sequence import IntermediateTensors
|
|||||||
from vllm.transformers_utils.config import uses_mrope
|
from vllm.transformers_utils.config import uses_mrope
|
||||||
from vllm.transformers_utils.processor import (
|
from vllm.transformers_utils.processor import (
|
||||||
cached_image_processor_from_config)
|
cached_image_processor_from_config)
|
||||||
|
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||||
|
|
||||||
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
|
||||||
SupportsMultiModal, SupportsPP)
|
SupportsMultiModal, SupportsPP)
|
||||||
@ -102,77 +103,62 @@ def smart_resize(
|
|||||||
return h_bar, w_bar
|
return h_bar, w_bar
|
||||||
|
|
||||||
|
|
||||||
class KeyeImagePixelInputs(TypedDict):
|
class KeyeImagePixelInputs(TensorSchema):
|
||||||
|
"""
|
||||||
|
Dimensions:
|
||||||
|
- np: Number of patches
|
||||||
|
- cps: Number of channels * patch_size * patch_size
|
||||||
|
- ni: Number of images
|
||||||
|
- g: Grid dimensions (3 for t, h, w)
|
||||||
|
"""
|
||||||
type: Literal["pixel_values"]
|
type: Literal["pixel_values"]
|
||||||
pixel_values: torch.Tensor
|
pixel_values: Annotated[torch.Tensor, TensorShape("np", "cps")]
|
||||||
"""Shape:
|
image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
|
||||||
`(num_patches, num_channels * patch_size * patch_size)`
|
|
||||||
|
|
||||||
|
class KeyeImageEmbeddingInputs(TensorSchema):
|
||||||
"""
|
"""
|
||||||
|
Dimensions:
|
||||||
image_grid_thw: torch.Tensor
|
- nf: Number of image features
|
||||||
"""Shape: `(num_images, 3)`
|
- hs: Hidden size (must match the hidden size of language model
|
||||||
This should be in `(grid_t, grid_h, grid_w)` format.
|
backbone)
|
||||||
|
- ni: Number of images
|
||||||
|
- g: Grid dimensions (3 for t, h, w)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class KeyeImageEmbeddingInputs(TypedDict):
|
|
||||||
type: Literal["image_embeds"]
|
type: Literal["image_embeds"]
|
||||||
image_embeds: torch.Tensor
|
image_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
|
||||||
"""Supported types:
|
image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
|
||||||
- list[`torch.Tensor`]: A list of tensors holding all images' features.
|
|
||||||
Each tensor holds an image's features.
|
|
||||||
- `torch.Tensor`: A tensor holding all images' features
|
|
||||||
(concatenation of all images' feature tensors).
|
|
||||||
|
|
||||||
Tensor shape: `(num_image_features, hidden_size)`
|
|
||||||
- `num_image_features` varies based on
|
|
||||||
the number and resolution of the images.
|
|
||||||
- `hidden_size` must match the hidden size of language model backbone.
|
|
||||||
"""
|
|
||||||
|
|
||||||
image_grid_thw: torch.Tensor
|
|
||||||
"""Shape: `(num_images, 3)`
|
|
||||||
This should be in `(grid_t, grid_h, grid_w)` format.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
KeyeImageInputs = Union[KeyeImagePixelInputs, KeyeImageEmbeddingInputs]
|
KeyeImageInputs = Union[KeyeImagePixelInputs, KeyeImageEmbeddingInputs]
|
||||||
|
|
||||||
|
|
||||||
class KeyeVideoPixelInputs(TypedDict):
|
class KeyeVideoPixelInputs(TensorSchema):
|
||||||
|
"""
|
||||||
|
Dimensions:
|
||||||
|
- np: Number of patches
|
||||||
|
- ctps: Number of channels * temporal_patch_size * patch_size *
|
||||||
|
patch_size
|
||||||
|
- nv: Number of videos
|
||||||
|
- g: Grid dimensions (3 for t, h, w)
|
||||||
|
"""
|
||||||
type: Literal["pixel_values_videos"]
|
type: Literal["pixel_values_videos"]
|
||||||
pixel_values_videos: torch.Tensor
|
pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctps")]
|
||||||
"""Shape:
|
video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
|
||||||
`(num_patches,
|
|
||||||
num_channels * temporal_patch_size * patch_size * patch_size)`
|
|
||||||
|
class KeyeVideoEmbeddingInputs(TensorSchema):
|
||||||
"""
|
"""
|
||||||
|
Dimensions:
|
||||||
video_grid_thw: torch.Tensor
|
- nf: Number of video features
|
||||||
"""Shape: `(num_videos, 3)`
|
- hs: Hidden size (must match the hidden size of language model
|
||||||
|
backbone)
|
||||||
This should be in `(grid_t, grid_h, grid_w)` format.
|
- nv: Number of videos
|
||||||
|
- g: Grid dimensions (3 for t, h, w)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class KeyeVideoEmbeddingInputs(TypedDict):
|
|
||||||
type: Literal["video_embeds"]
|
type: Literal["video_embeds"]
|
||||||
video_embeds: torch.Tensor
|
video_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
|
||||||
"""Supported types:
|
video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
|
||||||
- list[`torch.Tensor`]: A list of tensors holding all videos' features.
|
|
||||||
Each tensor holds a video's features.
|
|
||||||
- `torch.Tensor`: A tensor holding all videos' features
|
|
||||||
(concatenation of all videos' feature tensors).
|
|
||||||
|
|
||||||
Tensor shape: `(num_video_features, hidden_size)`
|
|
||||||
- `num_video_features` varies based on
|
|
||||||
the number and resolution of the videos.
|
|
||||||
- `hidden_size` must match the hidden size of language model backbone.
|
|
||||||
"""
|
|
||||||
|
|
||||||
video_grid_thw: torch.Tensor
|
|
||||||
"""Shape: `(num_videos, 3)`
|
|
||||||
This should be in `(grid_t, grid_h, grid_w)` format.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
KeyeVideoInputs = Union[KeyeVideoPixelInputs, KeyeVideoEmbeddingInputs]
|
KeyeVideoInputs = Union[KeyeVideoPixelInputs, KeyeVideoEmbeddingInputs]
|
||||||
@ -1420,10 +1406,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
|
|||||||
image_grid_thw = self._validate_and_reshape_mm_tensor(
|
image_grid_thw = self._validate_and_reshape_mm_tensor(
|
||||||
image_grid_thw, "image grid_thw")
|
image_grid_thw, "image grid_thw")
|
||||||
|
|
||||||
if not isinstance(pixel_values, (torch.Tensor, list)):
|
|
||||||
raise ValueError("Incorrect type of image pixel values. "
|
|
||||||
f"Got type: {type(pixel_values)}")
|
|
||||||
|
|
||||||
return KeyeImagePixelInputs(
|
return KeyeImagePixelInputs(
|
||||||
type="pixel_values",
|
type="pixel_values",
|
||||||
pixel_values=pixel_values,
|
pixel_values=pixel_values,
|
||||||
@ -1436,9 +1418,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
|
|||||||
image_grid_thw = self._validate_and_reshape_mm_tensor(
|
image_grid_thw = self._validate_and_reshape_mm_tensor(
|
||||||
image_grid_thw, "image grid_thw")
|
image_grid_thw, "image grid_thw")
|
||||||
|
|
||||||
if not isinstance(image_embeds, torch.Tensor):
|
|
||||||
raise ValueError("Incorrect type of image embeddings. "
|
|
||||||
f"Got type: {type(image_embeds)}")
|
|
||||||
return KeyeImageEmbeddingInputs(
|
return KeyeImageEmbeddingInputs(
|
||||||
type="image_embeds",
|
type="image_embeds",
|
||||||
image_embeds=image_embeds,
|
image_embeds=image_embeds,
|
||||||
@ -1474,9 +1453,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
|
|||||||
video_grid_thw = self._validate_and_reshape_mm_tensor(
|
video_grid_thw = self._validate_and_reshape_mm_tensor(
|
||||||
video_grid_thw, "video grid_thw")
|
video_grid_thw, "video grid_thw")
|
||||||
|
|
||||||
if not isinstance(video_embeds, torch.Tensor):
|
|
||||||
raise ValueError("Incorrect type of video embeddings. "
|
|
||||||
f"Got type: {type(video_embeds)}")
|
|
||||||
return KeyeVideoEmbeddingInputs(
|
return KeyeVideoEmbeddingInputs(
|
||||||
type="video_embeds",
|
type="video_embeds",
|
||||||
video_embeds=video_embeds,
|
video_embeds=video_embeds,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user