[V1] Unify VLLM_ENABLE_V1_MULTIPROCESSING handling in RayExecutor (#11472)

2026-03-16 15:37:13 +08:00 · 2024-12-24 19:49:46 -08:00 · 2024-12-24 19:49:46 -08:00 · 9832e5572a
commit 9832e5572a
parent 3f3e92e1f2
3 changed files with 4 additions and 8 deletions
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -127,11 +127,6 @@ def test_models_distributed(
    if attention_backend:
        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

-    # Import VLLM_USE_V1 dynamically to handle patching
-    from vllm.envs import VLLM_USE_V1
-    if VLLM_USE_V1 and distributed_executor_backend != "mp":
-        os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
-
    dtype = "half"
    max_tokens = 5

--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -21,7 +21,6 @@ from vllm.v1.engine.core_client import EngineCoreClient
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.executor.ray_utils import initialize_ray_cluster

 logger = init_logger(__name__)

@@ -112,7 +111,6 @@ class LLMEngine:
        distributed_executor_backend = (
            vllm_config.parallel_config.distributed_executor_backend)
        if distributed_executor_backend == "ray":
-            initialize_ray_cluster(vllm_config.parallel_config)
            from vllm.v1.executor.ray_executor import RayExecutor
            executor_class = RayExecutor
        elif distributed_executor_backend == "mp":
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -8,7 +8,8 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.executor.ray_utils import RayWorkerWrapper, ray
+from vllm.v1.executor.ray_utils import (RayWorkerWrapper,
+                                        initialize_ray_cluster, ray)
 from vllm.v1.outputs import ModelRunnerOutput

 if ray is not None:
@@ -33,7 +34,9 @@ class RayExecutor(Executor):
        if ray_usage != "1":
            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"

+        initialize_ray_cluster(self.parallel_config)
        placement_group = self.parallel_config.placement_group
+
        # Create the parallel GPU workers.
        self._init_workers_ray(placement_group)