diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index cebdf56c45b1b..05378ec74d2fa 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -13,7 +13,6 @@ import numpy as np
 from tqdm import tqdm
 
 import vllm.envs as envs
-from vllm import LLM, SamplingParams
 from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
                                        write_to_json)
 from vllm.engine.arg_utils import EngineArgs
@@ -85,6 +84,9 @@ def main(args: argparse.Namespace):
             "Please set it to a valid path to use torch profiler.")
     engine_args = EngineArgs.from_cli_args(args)
 
+    # Lazy import to avoid importing LLM when the bench command is not selected.
+    from vllm import LLM, SamplingParams
+
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
     llm = LLM(**dataclasses.asdict(engine_args))
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 88ef16b87e557..f6f83223a1488 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -8,6 +8,7 @@ import importlib
 import inspect
 import json
 import multiprocessing
+import multiprocessing.forkserver as forkserver
 import os
 import signal
 import socket
@@ -155,6 +156,15 @@ async def build_async_engine_client(
     client_config: Optional[dict[str, Any]] = None,
 ) -> AsyncIterator[EngineClient]:
 
+    if os.getenv("VLLM_WORKER_MULTIPROC_METHOD") == "forkserver":
+        # The executor is expected to be mp.
+        # Pre-import heavy modules in the forkserver process
+        logger.debug("Setup forkserver with pre-imports")
+        multiprocessing.set_start_method('forkserver')
+        multiprocessing.set_forkserver_preload(["vllm.v1.engine.async_llm"])
+        forkserver.ensure_running()
+        logger.debug("Forkserver setup complete!")
+
     # Context manager to handle engine_client lifecycle
     # Ensures everything is shutdown and cleaned up on error/exit
     engine_args = AsyncEngineArgs.from_cli_args(args)
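
For reference, below is a minimal self-contained sketch of the forkserver preload pattern that the api_server.py hunk relies on. It is illustrative only, not vLLM code: asyncio stands in for the heavy vllm.v1.engine.async_llm import, and the worker function is a placeholder for the engine worker process.

# Standalone sketch (not vLLM code) of the forkserver preload pattern used in
# the api_server.py hunk above. "asyncio" stands in for the heavy module that
# vLLM preloads ("vllm.v1.engine.async_llm"). Forkserver is Unix-only.
import multiprocessing
import multiprocessing.forkserver as forkserver


def worker() -> None:
    # The module was already imported by the forkserver parent, so this import
    # only binds the name from sys.modules instead of paying the import cost.
    import asyncio
    print("worker sees preloaded module:", asyncio.__name__)


if __name__ == "__main__":
    multiprocessing.set_start_method("forkserver")
    # Modules listed here are imported once in the forkserver process; every
    # worker forked from it inherits them already loaded.
    multiprocessing.set_forkserver_preload(["asyncio"])
    # Start the forkserver eagerly so the preload cost is paid up front,
    # mirroring the explicit ensure_running() call in the diff.
    forkserver.ensure_running()

    p = multiprocessing.Process(target=worker)
    p.start()
    p.join()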