From f39ab2d4bde85e169b85ea3555dc4b74224b3929 Mon Sep 17 00:00:00 2001
From: jinqinn
Date: Mon, 23 Jun 2025 11:36:26 +0800
Subject: [PATCH] [Misc] Configurable timeout for execute_model RPC calls via
 env var (#19544)

Signed-off-by: jinqinn
---
 vllm/envs.py                           |  6 ++++++
 vllm/v1/executor/multiproc_executor.py | 17 ++++++-----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/vllm/envs.py b/vllm/envs.py
index 93a7c8069c2d..01d8d8a2d2e0 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -130,6 +130,7 @@ if TYPE_CHECKING:
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
     VLLM_SLEEP_WHEN_IDLE: bool = False
     VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
+    VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
     VLLM_KV_CACHE_LAYOUT: Optional[str] = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
 
@@ -897,6 +898,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_MQ_MAX_CHUNK_BYTES_MB":
     lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),
 
+    # Timeout in seconds for execute_model RPC calls in multiprocessing
+    # executor (only applies when TP > 1).
+    "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS":
+    lambda: int(os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")),
+
     # KV Cache layout used throughout vllm.
     # Some common values are:
     # - NHD
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 2148680d5f56..b06b7cc804d5 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -37,11 +37,6 @@ from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
 
-POLLING_TIMEOUT_MS = 5000
-POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000
-
-EXECUTE_MODEL_TIMEOUT_S = 300
-
 
 class MultiprocExecutor(Executor):
 
@@ -160,12 +155,12 @@ class MultiprocExecutor(Executor):
         self,
         scheduler_output,
     ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
-        (output, ) = self.collective_rpc("execute_model",
-                                         args=(scheduler_output, ),
-                                         unique_reply_rank=self.output_rank,
-                                         non_block=self.max_concurrent_batches
-                                         > 1,
-                                         timeout=EXECUTE_MODEL_TIMEOUT_S)
+        (output, ) = self.collective_rpc(
+            "execute_model",
+            args=(scheduler_output, ),
+            unique_reply_rank=self.output_rank,
+            non_block=self.max_concurrent_batches > 1,
+            timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS)
         return output
 
     def collective_rpc(self,
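
A minimal usage sketch, not part of the patch itself (the model name, the
600-second value, and the offline `LLM` entry point are illustrative):
because `vllm.envs` resolves each variable lazily through `os.getenv` at
attribute-access time, the override only has to be present in the process
environment before the engine starts, and the 300-second default applies
whenever the variable is unset. Equivalently, export the variable in the
shell before launching `vllm serve`.

    import os

    # Illustrative override: allow each execute_model RPC up to 10 minutes
    # before the multiprocessing executor's collective_rpc times out.
    # Set it before engine startup so the first attribute access on
    # vllm.envs sees the new value.
    os.environ["VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS"] = "600"

    from vllm import LLM

    # Per the patch comment, the timeout only applies when TP > 1, i.e.
    # when the multiprocessing executor drives more than one worker.
    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=2)
    print(llm.generate("Hello, my name is")[0].outputs[0].text)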