[Misc] Increase RayDistributedExecutor RAY_CGRAPH_get_timeout (#15301)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2026-03-18 04:17:06 +08:00 · 2025-03-21 22:25:43 -07:00 · 2025-03-21 22:25:43 -07:00 · 4c69e228b3
commit 4c69e228b3
parent 790b79750b
1 changed files with 9 additions and 0 deletions
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -561,6 +561,15 @@ class RayDistributedExecutor(DistributedExecutorBase):
                    envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL)
        logger.info("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM = %s",
                    envs.VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM)
+        # Raise the default value of "RAY_CGRAPH_get_timeout" to 300 seconds
+        # (it is 10 seconds by default). This Ray environment variable
+        # controls the timeout for getting the result of a compiled graph
+        # execution, i.e., the distributed execution that includes model
+        # forward runs and intermediate tensor communications in vLLM.
+        os.environ.setdefault("RAY_CGRAPH_get_timeout", "300")  # noqa: SIM112
+        logger.info("RAY_CGRAPH_get_timeout is set to %s",
+                    os.environ["RAY_CGRAPH_get_timeout"])  # noqa: SIM112
+
        with InputNode() as input_data:
            # Example DAG: PP=2, TP=4
            #