From 4c69e228b32220ac9159dfdcf0df13ea776e630d Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 21 Mar 2025 22:25:43 -0700 Subject: [PATCH] [Misc] Increase RayDistributedExecutor RAY_CGRAPH_get_timeout (#15301) Signed-off-by: Rui Qiao --- vllm/executor/ray_distributed_executor.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index d769d235020d5..c823ab5bf9698 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -561,6 +561,15 @@ class RayDistributedExecutor(DistributedExecutorBase): envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL) logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s", envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM) + # Enlarge the default value of "RAY_CGRAPH_get_timeout" to 300 seconds + # (it is 10 seconds by default). This is a Ray environment variable to + # control the timeout of getting result from a compiled graph execution, + # i.e., the distributed execution that includes model forward runs and + # intermediate tensor communications, in the case of vllm. + os.environ.setdefault("RAY_CGRAPH_get_timeout", "300") # noqa: SIM112 + logger.info("RAY_CGRAPH_get_timeout is set to %s", + os.environ["RAY_CGRAPH_get_timeout"]) # noqa: SIM112 + with InputNode() as input_data: # Example DAG: PP=2, TP=4 #