mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 05:25:00 +08:00
[V1] Use pickle for serializing EngineCoreRequest & Add multimodal inputs to EngineCoreRequest (#10245)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
47db6ec831
commit
7c65527918
@ -1,10 +1,11 @@
|
||||
import enum
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Union
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
import msgspec
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
|
||||
|
||||
@ -22,7 +23,8 @@ class DetokenizerRequest:
|
||||
include_stop_str_in_output: bool
|
||||
|
||||
|
||||
class EngineCoreRequest(msgspec.Struct, omit_defaults=True):
|
||||
@dataclass
|
||||
class EngineCoreRequest:
|
||||
|
||||
# NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
|
||||
# but this object is currently not playing well with msgspec
|
||||
@ -33,6 +35,9 @@ class EngineCoreRequest(msgspec.Struct, omit_defaults=True):
|
||||
# always be tokenized?
|
||||
prompt: Optional[str]
|
||||
prompt_token_ids: List[int]
|
||||
mm_data: Optional[MultiModalDataDict]
|
||||
mm_placeholders: Optional[MultiModalPlaceholderDict]
|
||||
mm_processor_kwargs: Optional[Dict[str, Any]]
|
||||
sampling_params: SamplingParams
|
||||
eos_token_id: Optional[int]
|
||||
arrival_time: float
|
||||
|
||||
@ -19,6 +19,7 @@ from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
|
||||
EngineCoreRequest, EngineCoreRequestType)
|
||||
from vllm.v1.executor.gpu_executor import GPUExecutor
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.serial_utils import PickleEncoder
|
||||
from vllm.version import __version__ as VLLM_VERSION
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -315,7 +316,7 @@ class EngineCoreProc(EngineCore):
|
||||
"""Input socket IO thread."""
|
||||
|
||||
# Msgpack serialization decoding.
|
||||
decoder_add_req = msgpack.Decoder(EngineCoreRequest)
|
||||
decoder_add_req = PickleEncoder()
|
||||
decoder_abort_req = msgpack.Decoder(list[str])
|
||||
|
||||
with self.make_socket(input_path, zmq.constants.PULL) as socket:
|
||||
|
||||
@ -11,6 +11,7 @@ from vllm.utils import get_open_zmq_ipc_path
|
||||
from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
|
||||
EngineCoreRequest, EngineCoreRequestType)
|
||||
from vllm.v1.engine.core import EngineCore, EngineCoreProc
|
||||
from vllm.v1.serial_utils import PickleEncoder
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -115,7 +116,7 @@ class MPClient(EngineCoreClient):
|
||||
**kwargs,
|
||||
):
|
||||
# Serialization setup.
|
||||
self.encoder = msgspec.msgpack.Encoder()
|
||||
self.encoder = PickleEncoder()
|
||||
self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)
|
||||
|
||||
# ZMQ setup.
|
||||
|
||||
@ -91,7 +91,10 @@ class Processor:
|
||||
# Make Request for EngineCore.
|
||||
engine_core_request = EngineCoreRequest(
|
||||
request_id, processed_inputs.get("prompt"),
|
||||
processed_inputs.get("prompt_token_ids"), sampling_params,
|
||||
processed_inputs.get("prompt_token_ids"),
|
||||
processed_inputs.get("multi_modal_data"),
|
||||
processed_inputs.get("multi_modal_placeholders"),
|
||||
processed_inputs.get("mm_processor_kwargs"), sampling_params,
|
||||
eos_token_id, arrival_time, lora_request)
|
||||
|
||||
return detokenizer_request, engine_core_request
|
||||
|
||||
10
vllm/v1/serial_utils.py
Normal file
10
vllm/v1/serial_utils.py
Normal file
@ -0,0 +1,10 @@
|
||||
import pickle
|
||||
|
||||
|
||||
class PickleEncoder:
    """Thin codec that (de)serializes objects with the stdlib ``pickle``.

    Used as a drop-in encoder/decoder pair for IPC message passing.
    NOTE(security): ``decode`` runs ``pickle.loads``, which can execute
    arbitrary code — only feed it data from trusted, in-process peers.
    """

    def encode(self, obj):
        """Serialize ``obj`` into ``bytes`` using the default protocol."""
        payload = pickle.dumps(obj)
        return payload

    def decode(self, data):
        """Reconstruct and return the object pickled into ``data``."""
        restored = pickle.loads(data)
        return restored
|
||||
Loading…
x
Reference in New Issue
Block a user