[V1] VLM - Run the mm_mapper preprocessor in the frontend process (#10640)

Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Authored by Alexander Matveev on 2024-12-03 05:33:10 -05:00; committed by GitHub
parent f6084f6324
commit 3bc94cab69
7 changed files with 47 additions and 25 deletions

View File

@@ -27,9 +27,8 @@ def make_request() -> EngineCoreRequest:
        request_id=uuid.uuid4(),
        prompt=PROMPT,
        prompt_token_ids=PROMPT_TOKENS,
-       mm_data=None,
+       mm_inputs=None,
        mm_placeholders=None,
-       mm_processor_kwargs=None,
        sampling_params=SamplingParams(),
        eos_token_id=None,
        arrival_time=time.time(),

View File

@@ -29,9 +29,8 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
        request_id=str(uuid.uuid4()),
        prompt=PROMPT,
        prompt_token_ids=PROMPT_TOKENS,
-       mm_data=None,
+       mm_inputs=None,
        mm_placeholders=None,
-       mm_processor_kwargs=None,
        sampling_params=params,
        eos_token_id=None,
        arrival_time=time.time(),

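For context, the updated test helper now builds an EngineCoreRequest that carries pre-mapped mm_inputs instead of raw mm_data plus mm_processor_kwargs. The sketch below mirrors the two hunks above; the prompt constants and token ids are illustrative, and any request fields not visible in the hunks are omitted, so treat it as a sketch rather than the exact test code.

import time
import uuid

from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreRequest

PROMPT = "Hello, my name is"        # illustrative
PROMPT_TOKENS = [1, 15043, 29892]   # illustrative token ids


def make_request() -> EngineCoreRequest:
    # The request now carries the mapper output (mm_inputs) instead of raw
    # multi-modal data; a text-only prompt simply passes None.
    return EngineCoreRequest(
        request_id=str(uuid.uuid4()),
        prompt=PROMPT,
        prompt_token_ids=PROMPT_TOKENS,
        mm_inputs=None,
        mm_placeholders=None,
        sampling_params=SamplingParams(),
        eos_token_id=None,
        arrival_time=time.time(),
    )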
View File

@@ -7,7 +7,8 @@ import torch
from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never

if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
+    from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
+                                 MultiModalPlaceholderDict)
    from vllm.multimodal.inputs import MultiModalInputsV2

@@ -150,6 +151,12 @@ class TokenInputs(TypedDict):
    if the model supports it.
    """
+    multi_modal_inputs: NotRequired["MultiModalKwargs"]
+    """
+    Optional multi-modal inputs to pass to the model,
+    if the model supports it.
+    """
    multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"]
    """
    Placeholder ranges for the multi-modal data.

@@ -169,6 +176,7 @@ def token_inputs(
    token_type_ids: Optional[List[int]] = None,
    prompt: Optional[str] = None,
    multi_modal_data: Optional["MultiModalDataDict"] = None,
+    multi_modal_inputs: Optional["MultiModalKwargs"] = None,
    multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
    mm_processor_kwargs: Optional[Dict[str, Any]] = None,
) -> TokenInputs:

@@ -181,6 +189,8 @@ def token_inputs(
        inputs["token_type_ids"] = token_type_ids
    if multi_modal_data is not None:
        inputs["multi_modal_data"] = multi_modal_data
+    if multi_modal_inputs is not None:
+        inputs["multi_modal_inputs"] = multi_modal_inputs
    if multi_modal_placeholders is not None:
        inputs["multi_modal_placeholders"] = multi_modal_placeholders
    if mm_processor_kwargs is not None:

@@ -273,6 +283,18 @@ class SingletonInputsAdapter:
        assert_never(inputs)

+    @cached_property
+    def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]:
+        inputs = self.inputs
+        if inputs["type"] == "token":
+            return inputs.get("multi_modal_inputs", {})
+        if inputs["type"] == "multimodal":
+            return inputs.get("mm_kwargs", {})
+        assert_never(inputs)

    @cached_property
    def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict":
        inputs = self.inputs

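Taken together, these hunks let single-prompt inputs carry the mapper output end to end. A hedged sketch of the new plumbing follows; the import path and the SingletonInputsAdapter constructor call are assumptions based on the file shown above, and mm_kwargs / placeholders stand in for real MultiModalKwargs and MultiModalPlaceholderDict objects.

from vllm.inputs import SingletonInputsAdapter, token_inputs  # import path assumed

mm_kwargs = ...      # placeholder for a MultiModalKwargs produced by the HF input mapper
placeholders = ...   # placeholder for a MultiModalPlaceholderDict for the prompt

inputs = token_inputs(
    prompt_token_ids=[1, 2, 3],
    prompt="<image> describe this image",
    multi_modal_inputs=mm_kwargs,           # new field added by this commit
    multi_modal_placeholders=placeholders,
)

adapter = SingletonInputsAdapter(inputs)
# For "token" inputs this reads inputs["multi_modal_inputs"] (defaulting to {});
# for "multimodal" inputs it falls back to the processor's "mm_kwargs" field.
mapped = adapter.multi_modal_inputs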
View File

@@ -1,11 +1,11 @@
import enum
from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import List, Optional, Union
import msgspec
from vllm.lora.request import LoRARequest
-from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
+from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
from vllm.sampling_params import RequestOutputKind, SamplingParams

@@ -35,9 +35,8 @@ class EngineCoreRequest:
    # always be tokenized?
    prompt: Optional[str]
    prompt_token_ids: List[int]
-    mm_data: Optional[MultiModalDataDict]
+    mm_inputs: Optional[List[MultiModalKwargs]]
    mm_placeholders: Optional[MultiModalPlaceholderDict]
-    mm_processor_kwargs: Optional[Dict[str, Any]]
    sampling_params: SamplingParams
    eos_token_id: Optional[int]
    arrival_time: float

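The resulting request schema, reduced to the fields visible in the hunk above (a sketch; any fields of the real dataclass that the hunk does not show are omitted, and request_id is assumed):

from dataclasses import dataclass
from typing import List, Optional

from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
from vllm.sampling_params import SamplingParams


@dataclass
class EngineCoreRequest:
    request_id: str                 # assumed; not shown in the hunk
    prompt: Optional[str]
    prompt_token_ids: List[int]
    # Output of the multi-modal input mapper (one MultiModalKwargs per item),
    # replacing the old raw mm_data plus mm_processor_kwargs pair.
    mm_inputs: Optional[List[MultiModalKwargs]]
    mm_placeholders: Optional[MultiModalPlaceholderDict]
    sampling_params: SamplingParams
    eos_token_id: Optional[int]
    arrival_time: float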
View File

@@ -84,14 +84,7 @@ class EngineCore:
    def add_request(self, request: EngineCoreRequest):
        """Add request to the scheduler."""
        req = Request.from_engine_core_request(request)
-        # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may
-        # take 10-50 ms, which can cause a spike in the latency. We should
-        # consider moving this to a separate thread.
-        if req.mm_data:
-            req.mm_inputs = self.mm_input_mapper.process_inputs(
-                req.mm_data, req.mm_processor_kwargs)
        self.scheduler.add_request(req)

    def abort_requests(self, request_ids: List[str]):

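With the mapping moved out of the engine core, add_request collapses to a thin convert-and-enqueue step. A sketch of how the method reads after the removal above (the rest of the class, including scheduler construction, is elided):

from vllm.v1.engine import EngineCoreRequest
from vllm.v1.request import Request


class EngineCore:
    # Scheduler construction and the rest of the class are elided.

    def add_request(self, request: EngineCoreRequest):
        """Add request to the scheduler."""
        # The multi-modal mapping (PIL images -> tensors) already ran in the
        # frontend Processor, so the core only converts and enqueues.
        req = Request.from_engine_core_request(request)
        self.scheduler.add_request(req)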
View File

@@ -14,6 +14,7 @@ from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.config import try_get_generation_config
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest
+from vllm.v1.engine.mm_input_mapper import MMInputMapper

class Processor:

@@ -39,6 +40,9 @@ class Processor:
        self.input_processor = input_registry.create_input_processor(
            model_config)
+        # Multi-modal (huggingface) input mapper
+        self.mm_input_mapper = MMInputMapper(model_config)
        # TODO: run in an ThreadpoolExecutor or BackgroundProcess.
        # This ideally should releases the GIL, so we should not block the
        # asyncio loop while this is running.

@@ -96,6 +100,12 @@ class Processor:
        sampling_params.update_from_generation_config(
            self.generation_config_fields, eos_token_id)
+        # Preprocess multi-modal data
+        mm_inputs = self.mm_input_mapper.process_inputs(
+            decoder_inputs.multi_modal_data,
+            decoder_inputs.mm_processor_kwargs) if len(
+                decoder_inputs.multi_modal_data) > 0 else None
        # Make Request for Detokenizer.
        detokenizer_request = DetokenizerRequest(
            request_id,

@@ -113,9 +123,8 @@ class Processor:
            request_id,
            decoder_inputs.prompt,
            decoder_inputs.prompt_token_ids,
-            decoder_inputs.multi_modal_data,
+            mm_inputs,
            decoder_inputs.multi_modal_placeholders,
-            decoder_inputs.mm_processor_kwargs,
            sampling_params,
            eos_token_id,
            arrival_time,

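On the frontend side, the Processor now owns the mapper and runs it before handing the request to the engine core. A condensed sketch of that path: the class body is heavily abridged, and the helper name _build_engine_core_request is hypothetical, standing in for the relevant slice of process_inputs(); the mapper call and the EngineCoreRequest argument order follow the hunks above.

from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.mm_input_mapper import MMInputMapper


class Processor:

    def __init__(self, model_config):
        # The multi-modal (HuggingFace) input mapper now lives in the frontend
        # process instead of the engine core.
        self.mm_input_mapper = MMInputMapper(model_config)

    def _build_engine_core_request(self, request_id, decoder_inputs,
                                   sampling_params, eos_token_id, arrival_time):
        # Run the mapper eagerly so the core receives ready-made tensors.
        mm_inputs = self.mm_input_mapper.process_inputs(
            decoder_inputs.multi_modal_data,
            decoder_inputs.mm_processor_kwargs) if len(
                decoder_inputs.multi_modal_data) > 0 else None

        return EngineCoreRequest(
            request_id,
            decoder_inputs.prompt,
            decoder_inputs.prompt_token_ids,
            mm_inputs,
            decoder_inputs.multi_modal_placeholders,
            sampling_params,
            eos_token_id,
            arrival_time,
        )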
View File

@@ -45,9 +45,6 @@ class Request:
        self._all_token_ids: List[int] = self.prompt_token_ids.copy()
        self.num_computed_tokens = 0
-        # Raw multimodal data before the mm input mapper (e.g., PIL images).
-        self.mm_data = self.inputs.multi_modal_data
-        self.mm_processor_kwargs = self.inputs.mm_processor_kwargs
        mm_positions = self.inputs.multi_modal_placeholders
        if mm_positions:
            # FIXME(woosuk): Support other modalities.

@@ -55,7 +52,10 @@ class Request:
        else:
            self.mm_positions = []
        # Output of the mm input mapper (e.g., image tensors).
-        self.mm_inputs: List[MultiModalKwargs] = []
+        if self.inputs.multi_modal_inputs:
+            self.mm_inputs = self.inputs.multi_modal_inputs
+        else:
+            self.mm_inputs: List[MultiModalKwargs] = []

    @classmethod
    def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":

@@ -64,9 +64,10 @@ class Request:
            inputs=token_inputs(
                prompt_token_ids=request.prompt_token_ids,
                prompt=request.prompt,
-                multi_modal_data=request.mm_data,
+                multi_modal_data=None,
+                multi_modal_inputs=request.mm_inputs,
                multi_modal_placeholders=request.mm_placeholders,
-                mm_processor_kwargs=request.mm_processor_kwargs,
+                mm_processor_kwargs=None,
            ),
            sampling_params=request.sampling_params,
            eos_token_id=request.eos_token_id,

@@ -110,7 +111,7 @@ class Request:
        return RequestStatus.get_finished_reason(self.status)

    def has_encoder_inputs(self) -> bool:
-        return len(self.mm_data) > 0
+        return len(self.mm_inputs) > 0

    @property
    def num_encoder_inputs(self) -> int:
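
End to end, the handoff now looks roughly like this (a usage sketch; engine_core_request is assumed to be whatever the frontend Processor produced, and the scheduler call mirrors EngineCore.add_request above):

from vllm.v1.request import Request

req = Request.from_engine_core_request(engine_core_request)  # engine_core_request assumed

# The Request wraps the pre-mapped tensors via token_inputs(multi_modal_inputs=...),
# so raw mm_data never reaches the core and encoder scheduling keys off mm_inputs.
if req.has_encoder_inputs():
    # e.g. schedule encoder execution for the request's mm_inputs
    ...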