mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 13:35:36 +08:00
[Misc] Configurable timeout for execute_model RPC calls via env var (#19544)
Signed-off-by: jinqinn <goodqinjin@163.com>
This commit is contained in:
parent
4a0f7888a3
commit
f39ab2d4bd
@@ -130,6 +130,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
|
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
|
||||||
VLLM_SLEEP_WHEN_IDLE: bool = False
|
VLLM_SLEEP_WHEN_IDLE: bool = False
|
||||||
VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
|
VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
|
||||||
|
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
|
||||||
VLLM_KV_CACHE_LAYOUT: Optional[str] = None
|
VLLM_KV_CACHE_LAYOUT: Optional[str] = None
|
||||||
VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
|
VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
|
||||||
|
|
||||||
@@ -897,6 +898,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"VLLM_MQ_MAX_CHUNK_BYTES_MB":
|
"VLLM_MQ_MAX_CHUNK_BYTES_MB":
|
||||||
lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),
|
lambda: int(os.getenv("VLLM_MQ_MAX_CHUNK_BYTES_MB", "16")),
|
||||||
|
|
||||||
|
# Timeout in seconds for execute_model RPC calls in multiprocessing
|
||||||
|
# executor (only applies when TP > 1).
|
||||||
|
"VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS":
|
||||||
|
lambda: int(os.getenv("VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", "300")),
|
||||||
|
|
||||||
# KV Cache layout used throughout vllm.
|
# KV Cache layout used throughout vllm.
|
||||||
# Some common values are:
|
# Some common values are:
|
||||||
# - NHD
|
# - NHD
|
||||||
|
|||||||
@@ -37,11 +37,6 @@ from vllm.worker.worker_base import WorkerWrapperBase
|
|||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
POLLING_TIMEOUT_MS = 5000
|
|
||||||
POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000
|
|
||||||
|
|
||||||
EXECUTE_MODEL_TIMEOUT_S = 300
|
|
||||||
|
|
||||||
|
|
||||||
class MultiprocExecutor(Executor):
|
class MultiprocExecutor(Executor):
|
||||||
|
|
||||||
@@ -160,12 +155,12 @@ class MultiprocExecutor(Executor):
|
|||||||
self,
|
self,
|
||||||
scheduler_output,
|
scheduler_output,
|
||||||
) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
|
) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
|
||||||
(output, ) = self.collective_rpc("execute_model",
|
(output, ) = self.collective_rpc(
|
||||||
args=(scheduler_output, ),
|
"execute_model",
|
||||||
unique_reply_rank=self.output_rank,
|
args=(scheduler_output, ),
|
||||||
non_block=self.max_concurrent_batches
|
unique_reply_rank=self.output_rank,
|
||||||
> 1,
|
non_block=self.max_concurrent_batches > 1,
|
||||||
timeout=EXECUTE_MODEL_TIMEOUT_S)
|
timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def collective_rpc(self,
|
def collective_rpc(self,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user