mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-06 11:29:08 +08:00
Fix EVS crash when using video_embeds inputs in Qwen2.5-VL (#29232)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
parent
ae66818379
commit
d84d8f4429
@ -230,6 +230,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
|
|||||||
- hidden_size must match the hidden size of language model backbone.
|
- hidden_size must match the hidden size of language model backbone.
|
||||||
- video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
|
- video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
|
||||||
format
|
format
|
||||||
|
- second_per_grid_ts: The video time interval (in seconds) for each
|
||||||
|
grid along the temporal dimension in the 3D position IDs. Returned
|
||||||
|
when `videos` is not `None`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
type: Literal["video_embeds"]
|
type: Literal["video_embeds"]
|
||||||
@ -244,6 +247,11 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
|
|||||||
TensorShape("nv", 3),
|
TensorShape("nv", 3),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
second_per_grid_ts: Annotated[
|
||||||
|
torch.Tensor | None,
|
||||||
|
TensorShape("nv"),
|
||||||
|
] = None
|
||||||
|
|
||||||
|
|
||||||
Qwen2_5_VLVideoInputs: TypeAlias = (
|
Qwen2_5_VLVideoInputs: TypeAlias = (
|
||||||
Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs
|
Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs
|
||||||
@ -1311,6 +1319,7 @@ class Qwen2_5_VLForConditionalGeneration(
|
|||||||
type="video_embeds",
|
type="video_embeds",
|
||||||
video_embeds=video_embeds,
|
video_embeds=video_embeds,
|
||||||
video_grid_thw=video_grid_thw,
|
video_grid_thw=video_grid_thw,
|
||||||
|
second_per_grid_ts=second_per_grid_ts,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _process_image_input(
|
def _process_image_input(
|
||||||
@ -1422,7 +1431,13 @@ class Qwen2_5_VLForConditionalGeneration(
|
|||||||
|
|
||||||
# Cast to long to match the original code
|
# Cast to long to match the original code
|
||||||
# https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa
|
# https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa
|
||||||
second_per_grid_ts = video_input["second_per_grid_ts"].long()
|
second_per_grid_ts = video_input.get("second_per_grid_ts")
|
||||||
|
if second_per_grid_ts is None:
|
||||||
|
raise ValueError(
|
||||||
|
"second_per_grid_ts is required when video_pruning_rate > 0 "
|
||||||
|
"is enabled for video inputs, including the video_embeds path."
|
||||||
|
)
|
||||||
|
second_per_grid_ts = second_per_grid_ts.long()
|
||||||
tokens_per_second = self.config.vision_config.tokens_per_second
|
tokens_per_second = self.config.vision_config.tokens_per_second
|
||||||
|
|
||||||
video_embeds_out = []
|
video_embeds_out = []
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user