mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-26 13:57:04 +08:00
[BugFix][Model] Fix Ernie4.5-VL hanging on long inputs (#24074)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
This commit is contained in:
parent
3e0d4a3475
commit
b6fbc15634
@ -66,8 +66,6 @@ from .vision import get_vit_attn_backend
|
|||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
_MAX_FRAMES_PER_VIDEO = 16
|
|
||||||
|
|
||||||
# === Vision Transformer === #
|
# === Vision Transformer === #
|
||||||
|
|
||||||
|
|
||||||
@ -839,6 +837,15 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
|
|||||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||||
return {"image": None, "video": None}
|
return {"image": None, "video": None}
|
||||||
|
|
||||||
|
def get_mm_max_tokens_per_item(
|
||||||
|
self,
|
||||||
|
seq_len: int,
|
||||||
|
mm_counts: Mapping[str, int],
|
||||||
|
) -> Mapping[str, int]:
|
||||||
|
max_image_tokens = self.get_max_image_tokens()
|
||||||
|
max_video_tokens = self.get_max_video_tokens(seq_len, mm_counts)
|
||||||
|
return {"image": max_image_tokens, "video": max_video_tokens}
|
||||||
|
|
||||||
def _get_vision_info(
|
def _get_vision_info(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
@ -964,8 +971,7 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
|
|||||||
max_image_tokens = self.get_max_image_tokens() * max_images
|
max_image_tokens = self.get_max_image_tokens() * max_images
|
||||||
max_total_frames = self._get_max_video_frames(seq_len -
|
max_total_frames = self._get_max_video_frames(seq_len -
|
||||||
max_image_tokens)
|
max_image_tokens)
|
||||||
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
|
max_frames_per_video = max_total_frames // max(max_videos, 1)
|
||||||
_MAX_FRAMES_PER_VIDEO)
|
|
||||||
|
|
||||||
return max(max_frames_per_video, 2)
|
return max(max_frames_per_video, 2)
|
||||||
|
|
||||||
|
|||||||
@ -287,8 +287,13 @@ class Ernie4_5_VLMoeMoE(nn.Module):
|
|||||||
if self.has_shared_experts:
|
if self.has_shared_experts:
|
||||||
shared_output = self.shared_experts(hidden_states)
|
shared_output = self.shared_experts(hidden_states)
|
||||||
|
|
||||||
if visual_token_mask is not None and visual_token_mask.any():
|
if visual_token_mask is not None and visual_token_mask.all():
|
||||||
# assert visual_token_mask.shape[0] != hidden_states.shape[0]
|
# only vision modal input
|
||||||
|
router_logits, _ = self.vision_experts_gate(hidden_states)
|
||||||
|
final_hidden_states = self.vision_experts(
|
||||||
|
hidden_states=hidden_states, router_logits=router_logits)
|
||||||
|
elif visual_token_mask is not None and visual_token_mask.any():
|
||||||
|
# text and vision modals input
|
||||||
visual_token_mask = visual_token_mask.repeat(
|
visual_token_mask = visual_token_mask.repeat(
|
||||||
1, self.hidden_size).bool()
|
1, self.hidden_size).bool()
|
||||||
text_token_mask = ~visual_token_mask
|
text_token_mask = ~visual_token_mask
|
||||||
@ -310,7 +315,7 @@ class Ernie4_5_VLMoeMoE(nn.Module):
|
|||||||
hidden_states=vision_hidden_states,
|
hidden_states=vision_hidden_states,
|
||||||
router_logits=vision_router_logits).flatten()
|
router_logits=vision_router_logits).flatten()
|
||||||
else:
|
else:
|
||||||
# text modal input processing directly
|
# only text modal input
|
||||||
text_router_logits, _ = self.text_experts_gate(hidden_states)
|
text_router_logits, _ = self.text_experts_gate(hidden_states)
|
||||||
|
|
||||||
final_hidden_states = self.text_experts(
|
final_hidden_states = self.text_experts(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user