diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index d2fc0916bc555..0cb3b739b7245 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -117,7 +117,7 @@ def test_models_distributed(
         pytest.skip(f"Skip test for {test_suite}")
 
     if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-        # test ray adag
+        # test Ray Compiled Graph
         os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
         os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
 
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index a500ba9dfe02a..fd4a804183bf5 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -93,7 +93,7 @@ def test_models_distributed(
 
     if (model == "meta-llama/Llama-3.2-1B-Instruct"
            and distributed_executor_backend == "ray"):
-        # test ray adag
+        # test Ray Compiled Graph
         os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
         os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
 
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 9677ccd2ea823..390ed91c26051 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -324,8 +324,8 @@ def _compare_tp(
     specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
     if distributed_backend == "ray" and (vllm_major_version == "1"
                                          or specific_case):
-        # For V1, test Ray ADAG for all the tests
-        # For V0, test Ray ADAG for a subset of the tests
+        # For V1, test Ray Compiled Graph for all the tests
+        # For V0, test Ray Compiled Graph for a subset of the tests
         pp_env = {
             "VLLM_USE_V1": vllm_major_version,
             "VLLM_USE_RAY_COMPILED_DAG": "1",
@@ -333,7 +333,7 @@ def _compare_tp(
             "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
         }
         # Temporary. Currently when zeromq + SPMD is used, it does not properly
-        # terminate because of aDAG issue.
+        # terminate because of a Ray Compiled Graph issue.
         common_args.append("--disable-frontend-multiprocessing")
     else:
         pp_env = None
@@ -367,8 +367,9 @@ def _compare_tp(
         if pp_env is None:
             raise
         else:
-            # Ray ADAG tests are flaky, so we don't want to fail the test
-            logger.exception("Ray ADAG tests failed")
+            # Ray Compiled Graph tests are flaky,
+            # so we don't want to fail the test
+            logger.exception("Ray Compiled Graph tests failed")
 
 
 @pytest.mark.parametrize(
diff --git a/vllm/envs.py b/vllm/envs.py
index 84426cb5bb22b..048d63bfec0f7 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -371,21 +371,22 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     "VLLM_USE_RAY_SPMD_WORKER":
     lambda: bool(int(os.getenv("VLLM_USE_RAY_SPMD_WORKER", "0"))),
 
-    # If the env var is set, it uses the Ray's compiled DAG API
-    # which optimizes the control plane overhead.
+    # If the env var is set, it uses Ray's Compiled Graph
+    # (previously known as ADAG) API, which optimizes the
+    # control plane overhead.
     # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
     "VLLM_USE_RAY_COMPILED_DAG":
     lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG", "0"))),
 
     # If the env var is set, it uses NCCL for communication in
-    # Ray's compiled DAG. This flag is ignored if
+    # Ray's Compiled Graph. This flag is ignored if
     # VLLM_USE_RAY_COMPILED_DAG is not set.
     "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL":
     lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL", "1"))
                  ),
 
     # If the env var is set, it enables GPU communication overlap
-    # (experimental feature) in Ray's compiled DAG. This flag is ignored if
+    # (experimental feature) in Ray's Compiled Graph. This flag is ignored if
     # VLLM_USE_RAY_COMPILED_DAG is not set.
     "VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM":
     lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0"))
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 2908fefc8e7eb..c3b41d1c11340 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -491,7 +491,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
         async_run_remote_workers_only to complete."""
         ray.get(parallel_worker_tasks)
 
-    def _check_ray_adag_installation(self):
+    def _check_ray_cgraph_installation(self):
         import pkg_resources
         from packaging import version
 
@@ -503,10 +503,10 @@ class RayDistributedExecutor(DistributedExecutorBase):
                 f"required, but found {current_version}")
 
         import importlib.util
-        adag_spec = importlib.util.find_spec(
+        cgraph_spec = importlib.util.find_spec(
             "ray.experimental.compiled_dag_ref")
-        if adag_spec is None:
-            raise ValueError("Ray accelerated DAG is not installed. "
+        if cgraph_spec is None:
+            raise ValueError("Ray Compiled Graph is not installed. "
                              "Run `pip install ray[adag]` to install it.")
 
         cupy_spec = importlib.util.find_spec("cupy")
@@ -518,7 +518,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
 
     def _compiled_ray_dag(self, enable_asyncio: bool):
         assert self.parallel_config.use_ray
-        self._check_ray_adag_installation()
+        self._check_ray_cgraph_installation()
         from ray.dag import InputNode, MultiOutputNode
         from ray.experimental.channel.torch_tensor_type import TorchTensorType
 
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index a9661fe0ef160..6067f9a3c13b8 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -83,9 +83,9 @@ try:
 
             execute_model_req = self.input_decoder.decode(serialized_req)
 
-            # TODO(swang): This is needed right now because Ray aDAG executes
-            # on a background thread, so we need to reset torch's current
-            # device.
+            # TODO(swang): This is needed right now because Ray Compiled Graph
+            # executes on a background thread, so we need to reset torch's
+            # current device.
             import torch
             if not self.compiled_dag_cuda_device_set:
                 torch.cuda.set_device(self.worker.device)
@@ -119,7 +119,7 @@ try:
                                    "IntermediateTensors"]],
         ) -> Union["ModelRunnerOutput", Tuple["SchedulerOutput",
                                               "IntermediateTensors"]]:
-            # this method is used to compile ray CG,
+            # This method is used by Ray Compiled Graph to execute the model,
             # and it needs a special logic of self.setup_device_if_necessary()
             self.setup_device_if_necessary()
             assert self.worker is not None, "Worker is not initialized"