diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index 33b96af3018a3..739cbedc2f8cc 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -29,6 +29,8 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
 # yapf: enable
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
+from vllm.transformers_utils.config import (
+    maybe_register_config_serialize_by_value)
 from vllm.usage.usage_lib import UsageContext
 from vllm.worker.model_runner_base import InputProcessingError
 
@@ -42,12 +44,12 @@ class MQLLMEngine:
     """A multiprocessing wrapper for :class:`LLMEngine`.
 
     This class is used to wrap the :class:`LLMEngine` class to enable use
-    in concurrnet manner. It runs a background loop and uses zeromq to 
+    in a concurrent manner. It runs a background loop and uses zeromq to
     receive new requests and stream outputs incrementally via ipc.
-    
+
     The :class:`LLMEngine` generate or encode process is kicked off when a
     new RPCProcessRequest is received by the input_socket.
-    
+
     The self.engine_loop checks the input_socket for new requests,
     adds them to the LLMEngine if there are any, calls the internal
     :class:`LLMEngine.step()`, and sends the RequestOutputs back over
@@ -428,6 +430,9 @@ def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
                   ipc_path: str, disable_log_stats: bool,
                   disable_log_requests: bool, engine_alive):
     try:
+        # Ensure we can serialize transformer config before spawning
+        maybe_register_config_serialize_by_value()
+
         engine = MQLLMEngine.from_vllm_config(
             vllm_config=vllm_config,
             usage_context=usage_context,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index bc74ebd205d15..ef557193ae2a6 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -82,6 +82,8 @@ from vllm.entrypoints.openai.serving_transcription import (
 from vllm.entrypoints.openai.tool_parsers import ToolParserManager
 from vllm.entrypoints.utils import load_aware_call, with_cancellation
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import (
+    maybe_register_config_serialize_by_value)
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path,
                         is_valid_ipv6_address, set_ulimit)
@@ -221,6 +223,9 @@ async def build_async_engine_client_from_engine_args(
         # so we need to spawn a new process
         context = multiprocessing.get_context("spawn")
 
+        # Ensure we can serialize transformer config before spawning
+        maybe_register_config_serialize_by_value()
+
         # The Process can raise an exception during startup, which may
         # not actually result in an exitcode being reported. As a result
         # we use a shared variable to communicate the information.
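
For context on why this registration must happen before the `spawn` context is used: every argument handed to a spawned child process (including the Hugging Face config carried inside `vllm_config`) must survive pickling, and stock `pickle` serializes a class only as a module-qualified reference, which breaks for `trust_remote_code` configs whose modules are created dynamically in the parent process. The sketch below is not vLLM code; it is a minimal, self-contained illustration of the mechanism, assuming `maybe_register_config_serialize_by_value` is built on cloudpickle's `register_pickle_by_value` (as its name suggests). The module name `dynamic_config_mod` and the class `MyConfig` are invented for this example.

```python
import pickle
import sys
import types

import cloudpickle  # vLLM already depends on cloudpickle

# Stand-in for a trust_remote_code config module that exists only in the
# parent process; "dynamic_config_mod" is an invented name for illustration.
mod = types.ModuleType("dynamic_config_mod")
exec("class MyConfig:\n    model_type = 'custom'", mod.__dict__)
sys.modules["dynamic_config_mod"] = mod
cfg = mod.MyConfig()

# Plain pickle stores only a module-qualified reference to the class, so a
# spawned child that cannot import "dynamic_config_mod" fails on loads().
by_reference = pickle.dumps(cfg)

# register_pickle_by_value embeds the class definition in the payload itself.
cloudpickle.register_pickle_by_value(mod)
by_value = cloudpickle.dumps(cfg)

# Simulate the child process by dropping the module before unpickling.
del sys.modules["dynamic_config_mod"]
restored = pickle.loads(by_value)  # works: the class travels with the data
print(type(restored).__name__)     # -> MyConfig
try:
    pickle.loads(by_reference)     # fails: module not importable here
except ModuleNotFoundError as exc:
    print(exc)
```

Serializing by value trades a larger pickle payload for portability across processes, which is presumably why the helper is a conditional `maybe_...` rather than an unconditional registration.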