[V1] VLM - Run the mm_mapper preprocessor in the frontend process (#10640)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
parent f6084f6324
commit 3bc94cab69
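
What the change amounts to, as a minimal standalone sketch (toy names only, not vLLM's actual classes): the multi-modal input mapper now runs in the frontend process, so the request sent to the engine core carries pre-mapped inputs (mm_inputs) instead of raw data (mm_data) plus mm_processor_kwargs, and the core no longer pays the 10-50 ms mapping cost on its hot path.

# Toy sketch of the new request flow; ToyEngineCoreRequest, frontend_process and
# engine_core_add_request are illustrative names, not vLLM APIs.
from dataclasses import dataclass
from typing import Any, Callable, List, Optional


@dataclass
class ToyEngineCoreRequest:
    prompt_token_ids: List[int]
    mm_inputs: Optional[List[Any]]  # pre-mapped tensors, produced in the frontend


def frontend_process(prompt_token_ids: List[int], raw_mm_data: List[Any],
                     mapper: Callable[[List[Any]], List[Any]]) -> ToyEngineCoreRequest:
    # The mapping cost (e.g. PIL image -> tensor) is paid here, before the
    # request ever reaches the engine core.
    mm_inputs = mapper(raw_mm_data) if raw_mm_data else None
    return ToyEngineCoreRequest(prompt_token_ids, mm_inputs)


def engine_core_add_request(req: ToyEngineCoreRequest,
                            scheduler: List[ToyEngineCoreRequest]) -> None:
    # The core no longer invokes the input mapper; it only schedules.
    scheduler.append(req)
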
@@ -27,9 +27,8 @@ def make_request() -> EngineCoreRequest:
         request_id=uuid.uuid4(),
         prompt=PROMPT,
         prompt_token_ids=PROMPT_TOKENS,
-        mm_data=None,
+        mm_inputs=None,
         mm_placeholders=None,
-        mm_processor_kwargs=None,
         sampling_params=SamplingParams(),
         eos_token_id=None,
         arrival_time=time.time(),
@@ -29,9 +29,8 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
         request_id=str(uuid.uuid4()),
         prompt=PROMPT,
         prompt_token_ids=PROMPT_TOKENS,
-        mm_data=None,
+        mm_inputs=None,
         mm_placeholders=None,
-        mm_processor_kwargs=None,
         sampling_params=params,
         eos_token_id=None,
         arrival_time=time.time(),
@@ -7,7 +7,8 @@ import torch
 from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never
 
 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
+    from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs,
+                                 MultiModalPlaceholderDict)
     from vllm.multimodal.inputs import MultiModalInputsV2
 
 
@@ -150,6 +151,12 @@ class TokenInputs(TypedDict):
     if the model supports it.
     """
 
+    multi_modal_inputs: NotRequired["MultiModalKwargs"]
+    """
+    Optional multi-modal inputs to pass to the model,
+    if the model supports it.
+    """
+
     multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"]
     """
     Placeholder ranges for the multi-modal data.
@@ -169,6 +176,7 @@ def token_inputs(
     token_type_ids: Optional[List[int]] = None,
     prompt: Optional[str] = None,
     multi_modal_data: Optional["MultiModalDataDict"] = None,
+    multi_modal_inputs: Optional["MultiModalKwargs"] = None,
    multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None,
     mm_processor_kwargs: Optional[Dict[str, Any]] = None,
 ) -> TokenInputs:
@@ -181,6 +189,8 @@ def token_inputs(
         inputs["token_type_ids"] = token_type_ids
     if multi_modal_data is not None:
         inputs["multi_modal_data"] = multi_modal_data
+    if multi_modal_inputs is not None:
+        inputs["multi_modal_inputs"] = multi_modal_inputs
     if multi_modal_placeholders is not None:
         inputs["multi_modal_placeholders"] = multi_modal_placeholders
     if mm_processor_kwargs is not None:
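
A minimal usage sketch of the extended token_inputs() constructor (assuming vLLM at this commit is installed; the import path below is the usual public one and is an assumption): callers can now attach pre-mapped multi_modal_inputs directly, and any argument left as None is simply omitted from the resulting TokenInputs dict.

from vllm.inputs import token_inputs

inputs = token_inputs(
    prompt_token_ids=[1, 2, 3],
    prompt="<image> describe this image",
    multi_modal_data=None,        # raw data no longer has to travel with the request
    multi_modal_inputs=None,      # would be a MultiModalKwargs produced by the mapper
    multi_modal_placeholders=None,
)
assert inputs["type"] == "token"
assert "multi_modal_inputs" not in inputs  # None arguments are dropped
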
@@ -273,6 +283,18 @@ class SingletonInputsAdapter:
 
         assert_never(inputs)
 
+    @cached_property
+    def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]:
+        inputs = self.inputs
+
+        if inputs["type"] == "token":
+            return inputs.get("multi_modal_inputs", {})
+
+        if inputs["type"] == "multimodal":
+            return inputs.get("mm_kwargs", {})
+
+        assert_never(inputs)
+
     @cached_property
     def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict":
         inputs = self.inputs
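
A short sketch of how the new cached property behaves (the import path and direct-construction pattern are assumptions, not taken from the diff): for legacy "token" inputs it returns the pre-mapped multi_modal_inputs, for "multimodal" (MultiModalInputsV2) inputs it returns mm_kwargs, and in both cases it falls back to an empty dict.

from vllm.inputs import SingletonInputsAdapter, token_inputs

adapter = SingletonInputsAdapter(token_inputs(prompt_token_ids=[1, 2, 3]))
assert adapter.multi_modal_inputs == {}  # nothing was attached, so the default applies
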
@@ -1,11 +1,11 @@
 import enum
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import List, Optional, Union
 
 import msgspec
 
 from vllm.lora.request import LoRARequest
-from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
+from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 
 
@@ -35,9 +35,8 @@ class EngineCoreRequest:
     # always be tokenized?
     prompt: Optional[str]
     prompt_token_ids: List[int]
-    mm_data: Optional[MultiModalDataDict]
+    mm_inputs: Optional[List[MultiModalKwargs]]
     mm_placeholders: Optional[MultiModalPlaceholderDict]
-    mm_processor_kwargs: Optional[Dict[str, Any]]
     sampling_params: SamplingParams
     eos_token_id: Optional[int]
     arrival_time: float
@@ -84,14 +84,7 @@ class EngineCore:
 
     def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
-
         req = Request.from_engine_core_request(request)
-        # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may
-        # take 10-50 ms, which can cause a spike in the latency. We should
-        # consider moving this to a separate thread.
-        if req.mm_data:
-            req.mm_inputs = self.mm_input_mapper.process_inputs(
-                req.mm_data, req.mm_processor_kwargs)
         self.scheduler.add_request(req)
 
     def abort_requests(self, request_ids: List[str]):
@@ -14,6 +14,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.config import try_get_generation_config
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest
+from vllm.v1.engine.mm_input_mapper import MMInputMapper
 
 
 class Processor:
@@ -39,6 +40,9 @@ class Processor:
         self.input_processor = input_registry.create_input_processor(
             model_config)
 
+        # Multi-modal (huggingface) input mapper
+        self.mm_input_mapper = MMInputMapper(model_config)
+
         # TODO: run in an ThreadpoolExecutor or BackgroundProcess.
         # This ideally should releases the GIL, so we should not block the
         # asyncio loop while this is running.
@@ -96,6 +100,12 @@ class Processor:
         sampling_params.update_from_generation_config(
             self.generation_config_fields, eos_token_id)
 
+        # Preprocess multi-modal data
+        mm_inputs = self.mm_input_mapper.process_inputs(
+            decoder_inputs.multi_modal_data,
+            decoder_inputs.mm_processor_kwargs) if len(
+                decoder_inputs.multi_modal_data) > 0 else None
+
         # Make Request for Detokenizer.
         detokenizer_request = DetokenizerRequest(
             request_id,
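
The added block above is a single conditional expression; an equivalent standalone sketch (toy function name, not vLLM's API) makes the guard explicit: the mapper runs only when the request actually carries multi-modal data, otherwise None is forwarded to the EngineCoreRequest.

from typing import Any, Callable, Dict, List, Optional


def preprocess_mm(multi_modal_data: Dict[str, Any],
                  mm_processor_kwargs: Optional[Dict[str, Any]],
                  process_inputs: Callable[..., List[Any]]) -> Optional[List[Any]]:
    # Equivalent to: process_inputs(...) if len(multi_modal_data) > 0 else None
    if len(multi_modal_data) > 0:
        return process_inputs(multi_modal_data, mm_processor_kwargs)
    return None


assert preprocess_mm({}, None, lambda data, kwargs: [data]) is None
assert preprocess_mm({"image": object()}, None, lambda data, kwargs: [data]) is not None
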
@@ -113,9 +123,8 @@ class Processor:
             request_id,
             decoder_inputs.prompt,
             decoder_inputs.prompt_token_ids,
-            decoder_inputs.multi_modal_data,
+            mm_inputs,
             decoder_inputs.multi_modal_placeholders,
-            decoder_inputs.mm_processor_kwargs,
             sampling_params,
             eos_token_id,
             arrival_time,
@@ -45,9 +45,6 @@ class Request:
         self._all_token_ids: List[int] = self.prompt_token_ids.copy()
         self.num_computed_tokens = 0
 
-        # Raw multimodal data before the mm input mapper (e.g., PIL images).
-        self.mm_data = self.inputs.multi_modal_data
-        self.mm_processor_kwargs = self.inputs.mm_processor_kwargs
         mm_positions = self.inputs.multi_modal_placeholders
         if mm_positions:
             # FIXME(woosuk): Support other modalities.
@@ -55,7 +52,10 @@ class Request:
         else:
             self.mm_positions = []
         # Output of the mm input mapper (e.g., image tensors).
-        self.mm_inputs: List[MultiModalKwargs] = []
+        if self.inputs.multi_modal_inputs:
+            self.mm_inputs = self.inputs.multi_modal_inputs
+        else:
+            self.mm_inputs: List[MultiModalKwargs] = []
 
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
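
A toy sketch of the constructor fallback added above (illustrative function, not vLLM's API): pre-mapped inputs from the frontend are kept as-is, and the empty-list default keeps has_encoder_inputs(), which now counts mm_inputs rather than the removed mm_data, well-defined.

from typing import Any, List, Optional


def resolve_mm_inputs(multi_modal_inputs: Optional[List[Any]]) -> List[Any]:
    # Keep the frontend's pre-mapped inputs when present, else an empty list.
    return multi_modal_inputs if multi_modal_inputs else []


assert resolve_mm_inputs(None) == []                            # text-only request
assert resolve_mm_inputs(["pixel_values"]) == ["pixel_values"]  # frontend pre-mapped
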
@@ -64,9 +64,10 @@ class Request:
             inputs=token_inputs(
                 prompt_token_ids=request.prompt_token_ids,
                 prompt=request.prompt,
-                multi_modal_data=request.mm_data,
+                multi_modal_data=None,
+                multi_modal_inputs=request.mm_inputs,
                 multi_modal_placeholders=request.mm_placeholders,
-                mm_processor_kwargs=request.mm_processor_kwargs,
+                mm_processor_kwargs=None,
             ),
             sampling_params=request.sampling_params,
             eos_token_id=request.eos_token_id,
@@ -110,7 +111,7 @@ class Request:
         return RequestStatus.get_finished_reason(self.status)
 
     def has_encoder_inputs(self) -> bool:
-        return len(self.mm_data) > 0
+        return len(self.mm_inputs) > 0
 
     @property
     def num_encoder_inputs(self) -> int: