[V1] VLM - Run the mm_mapper preprocessor in the frontend process (#10640)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
parent f6084f6324
commit 3bc94cab69
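This change moves the multi-modal input mapping (the "mm_mapper", which turns raw inputs such as PIL images into model-ready tensors) out of EngineCore.add_request and into the frontend Processor, so EngineCoreRequest now carries pre-mapped mm_inputs instead of raw mm_data. The sketch below is a minimal, self-contained approximation of the resulting flow; SimpleMMInputMapper, FrontendProcessor, and Core are hypothetical stand-ins for vLLM's MMInputMapper, Processor, and EngineCore, not the actual implementation.

# Minimal sketch of the refactor (hypothetical classes, not vLLM's real API).
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class EngineCoreRequest:
    request_id: str
    prompt_token_ids: List[int]
    # After this change, the request carries pre-mapped inputs (mm_inputs)
    # rather than raw multi-modal data (mm_data).
    mm_inputs: Optional[List[Dict[str, Any]]]


class SimpleMMInputMapper:
    """Stand-in for the HF-based input mapper (e.g., PIL image -> tensor dict)."""

    def process_inputs(self, mm_data: Dict[str, Any],
                       mm_processor_kwargs: Optional[Dict[str, Any]] = None):
        # The real mapper returns MultiModalKwargs; plain dicts stand in here.
        return [{"pixel_values": image} for image in mm_data.get("image", [])]


class FrontendProcessor:
    """Runs in the frontend process, like vllm.v1.engine.processor.Processor."""

    def __init__(self) -> None:
        self.mm_input_mapper = SimpleMMInputMapper()

    def process_inputs(self, request_id: str, prompt_token_ids: List[int],
                       mm_data: Optional[Dict[str, Any]]) -> EngineCoreRequest:
        # The 10-50 ms mapping cost is now paid here, before the request is
        # serialized and handed to the engine core.
        mm_inputs = (self.mm_input_mapper.process_inputs(mm_data)
                     if mm_data else None)
        return EngineCoreRequest(request_id, prompt_token_ids, mm_inputs)


class Core:
    """Stand-in for EngineCore: it only builds a Request and schedules it."""

    def add_request(self, request: EngineCoreRequest) -> None:
        # No input mapping happens here any more; mm_inputs arrive pre-mapped.
        assert request.mm_inputs is None or isinstance(request.mm_inputs, list)


if __name__ == "__main__":
    frontend, core = FrontendProcessor(), Core()
    req = frontend.process_inputs("req-0", [1, 2, 3], {"image": ["<pil-image>"]})
    core.add_request(req)
    print(req.mm_inputs)  # [{'pixel_values': '<pil-image>'}]

The point of the move is visible in the sketch: the 10-50 ms mapping cost noted in the removed FIXME below is paid in the frontend process, outside the engine core's scheduling loop.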
@@ -27,9 +27,8 @@ def make_request() -> EngineCoreRequest:
         request_id=uuid.uuid4(),
         prompt=PROMPT,
         prompt_token_ids=PROMPT_TOKENS,
-        mm_data=None,
+        mm_inputs=None,
         mm_placeholders=None,
-        mm_processor_kwargs=None,
         sampling_params=SamplingParams(),
         eos_token_id=None,
         arrival_time=time.time(),
@@ -29,9 +29,8 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
         request_id=str(uuid.uuid4()),
         prompt=PROMPT,
         prompt_token_ids=PROMPT_TOKENS,
-        mm_data=None,
+        mm_inputs=None,
         mm_placeholders=None,
-        mm_processor_kwargs=None,
         sampling_params=params,
         eos_token_id=None,
         arrival_time=time.time(),
@@ -7,7 +7,8 @@ import torch
 from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never
 
 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
+    from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
+                                 MultiModalPlaceholderDict)
     from vllm.multimodal.inputs import MultiModalInputsV2
 
 
@@ -150,6 +151,12 @@ class TokenInputs(TypedDict):
     if the model supports it.
     """
 
+    multi_modal_inputs: NotRequired["MultiModalKwargs"]
+    """
+    Optional multi-modal inputs to pass to the model,
+    if the model supports it.
+    """
+
     multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"]
     """
     Placeholder ranges for the multi-modal data.
@@ -169,6 +176,7 @@ def token_inputs(
     token_type_ids: Optional[List[int]] = None,
     prompt: Optional[str] = None,
     multi_modal_data: Optional["MultiModalDataDict"] = None,
+    multi_modal_inputs: Optional["MultiModalKwargs"] = None,
     multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
     mm_processor_kwargs: Optional[Dict[str, Any]] = None,
 ) -> TokenInputs:
@@ -181,6 +189,8 @@ def token_inputs(
         inputs["token_type_ids"] = token_type_ids
     if multi_modal_data is not None:
         inputs["multi_modal_data"] = multi_modal_data
+    if multi_modal_inputs is not None:
+        inputs["multi_modal_inputs"] = multi_modal_inputs
     if multi_modal_placeholders is not None:
         inputs["multi_modal_placeholders"] = multi_modal_placeholders
     if mm_processor_kwargs is not None:
@@ -273,6 +283,18 @@ class SingletonInputsAdapter:
         assert_never(inputs)
 
+    @cached_property
+    def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]:
+        inputs = self.inputs
+
+        if inputs["type"] == "token":
+            return inputs.get("multi_modal_inputs", {})
+
+        if inputs["type"] == "multimodal":
+            return inputs.get("mm_kwargs", {})
+
+        assert_never(inputs)
+
     @cached_property
     def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict":
         inputs = self.inputs
 
@@ -1,11 +1,11 @@
 import enum
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import List, Optional, Union
 
 import msgspec
 
 from vllm.lora.request import LoRARequest
-from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
+from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 
 
@@ -35,9 +35,8 @@ class EngineCoreRequest:
     # always be tokenized?
     prompt: Optional[str]
     prompt_token_ids: List[int]
-    mm_data: Optional[MultiModalDataDict]
+    mm_inputs: Optional[List[MultiModalKwargs]]
     mm_placeholders: Optional[MultiModalPlaceholderDict]
-    mm_processor_kwargs: Optional[Dict[str, Any]]
     sampling_params: SamplingParams
     eos_token_id: Optional[int]
     arrival_time: float
@@ -84,14 +84,7 @@ class EngineCore:
 
     def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
-
         req = Request.from_engine_core_request(request)
-        # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may
-        # take 10-50 ms, which can cause a spike in the latency. We should
-        # consider moving this to a separate thread.
-        if req.mm_data:
-            req.mm_inputs = self.mm_input_mapper.process_inputs(
-                req.mm_data, req.mm_processor_kwargs)
         self.scheduler.add_request(req)
 
     def abort_requests(self, request_ids: List[str]):
@@ -14,6 +14,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.config import try_get_generation_config
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest
+from vllm.v1.engine.mm_input_mapper import MMInputMapper
 
 
 class Processor:
@@ -39,6 +40,9 @@ class Processor:
         self.input_processor = input_registry.create_input_processor(
             model_config)
 
+        # Multi-modal (huggingface) input mapper
+        self.mm_input_mapper = MMInputMapper(model_config)
+
         # TODO: run in a ThreadPoolExecutor or BackgroundProcess.
         # This ideally should release the GIL, so we should not block the
         # asyncio loop while this is running.
@@ -96,6 +100,12 @@ class Processor:
         sampling_params.update_from_generation_config(
             self.generation_config_fields, eos_token_id)
 
+        # Preprocess multi-modal data
+        mm_inputs = self.mm_input_mapper.process_inputs(
+            decoder_inputs.multi_modal_data,
+            decoder_inputs.mm_processor_kwargs) if len(
+                decoder_inputs.multi_modal_data) > 0 else None
+
         # Make Request for Detokenizer.
         detokenizer_request = DetokenizerRequest(
             request_id,
@@ -113,9 +123,8 @@ class Processor:
             request_id,
             decoder_inputs.prompt,
             decoder_inputs.prompt_token_ids,
-            decoder_inputs.multi_modal_data,
+            mm_inputs,
             decoder_inputs.multi_modal_placeholders,
-            decoder_inputs.mm_processor_kwargs,
             sampling_params,
             eos_token_id,
             arrival_time,
@@ -45,9 +45,6 @@ class Request:
         self._all_token_ids: List[int] = self.prompt_token_ids.copy()
         self.num_computed_tokens = 0
 
-        # Raw multimodal data before the mm input mapper (e.g., PIL images).
-        self.mm_data = self.inputs.multi_modal_data
-        self.mm_processor_kwargs = self.inputs.mm_processor_kwargs
         mm_positions = self.inputs.multi_modal_placeholders
         if mm_positions:
             # FIXME(woosuk): Support other modalities.
@@ -55,7 +52,10 @@ class Request:
         else:
             self.mm_positions = []
         # Output of the mm input mapper (e.g., image tensors).
-        self.mm_inputs: List[MultiModalKwargs] = []
+        if self.inputs.multi_modal_inputs:
+            self.mm_inputs = self.inputs.multi_modal_inputs
+        else:
+            self.mm_inputs: List[MultiModalKwargs] = []
 
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
@@ -64,9 +64,10 @@ class Request:
             inputs=token_inputs(
                 prompt_token_ids=request.prompt_token_ids,
                 prompt=request.prompt,
-                multi_modal_data=request.mm_data,
+                multi_modal_data=None,
+                multi_modal_inputs=request.mm_inputs,
                 multi_modal_placeholders=request.mm_placeholders,
-                mm_processor_kwargs=request.mm_processor_kwargs,
+                mm_processor_kwargs=None,
             ),
             sampling_params=request.sampling_params,
             eos_token_id=request.eos_token_id,
@@ -110,7 +111,7 @@ class Request:
         return RequestStatus.get_finished_reason(self.status)
 
     def has_encoder_inputs(self) -> bool:
-        return len(self.mm_data) > 0
+        return len(self.mm_inputs) > 0
 
     @property
     def num_encoder_inputs(self) -> int:
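For completeness, a short usage sketch of the extended token_inputs path introduced above. This is hedged: it assumes a vLLM checkout that already contains this change, that token_inputs is importable from vllm.inputs, and that MultiModalKwargs at this revision still behaves like a dict of tensors; none of that is stated in the diff itself.

# Hedged usage sketch; run against a vLLM tree that includes this change.
import torch

from vllm.inputs import token_inputs
from vllm.multimodal import MultiModalKwargs

# Pre-mapped multi-modal inputs, as the frontend MMInputMapper would produce.
mm_kwargs = MultiModalKwargs({"pixel_values": torch.zeros(1, 3, 336, 336)})

inputs = token_inputs(
    prompt_token_ids=[1, 2, 3],
    prompt="<image> Describe the image.",
    multi_modal_inputs=mm_kwargs,
)
assert inputs["multi_modal_inputs"] is mm_kwargs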