[V1] Simplify Shutdown (#11659)

2025-12-10 08:14:54 +08:00 · 2025-01-03 12:25:38 -05:00 · 2025-01-03 12:25:38 -05:00 · 80c751e7f6
commit 80c751e7f6
parent e1a5c2f0a1
7 changed files with 40 additions and 58 deletions
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -142,9 +142,6 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
        client.abort_requests([request.request_id])
        # Shutdown the client.
        client.shutdown()
@pytest.mark.asyncio
 async def test_engine_core_client_asyncio(monkeypatch):
@@ -200,6 +197,3 @@ async def test_engine_core_client_asyncio(monkeypatch):
            else:
                assert len(outputs[req_id]) == MAX_TOKENS, (
                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
        # Shutdown the client.
        client.shutdown()
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -232,11 +232,6 @@ class LLM:
        self.request_counter = Counter()
    def __del__(self):
        if hasattr(self, 'llm_engine') and self.llm_engine and hasattr(
                self.llm_engine, "shutdown"):
            self.llm_engine.shutdown()
    @staticmethod
    def get_engine_class() -> Type[LLMEngine]:
        if envs.VLLM_USE_V1:
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -103,9 +103,6 @@ class AsyncLLM(EngineClient):
        self.output_handler: Optional[asyncio.Task] = None
    def __del__(self):
        self.shutdown()
    @classmethod
    def from_engine_args(
        cls,
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -203,7 +203,6 @@ class EngineCoreProc(EngineCore):
        finally:
            if engine_core is not None:
                engine_core.shutdown()
                engine_core = None
    def run_busy_loop(self):
        """Core busy loop of the EngineCore."""
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -1,4 +1,6 @@
-from typing import List, Optional, Type
+import weakref
 from abc import ABC, abstractmethod
 from typing import List, Type
 import msgspec
 import zmq
@@ -18,7 +20,7 @@ from vllm.v1.utils import BackgroundProcHandle
 logger = init_logger(__name__)
-class EngineCoreClient:
+class EngineCoreClient(ABC):
    """
    EngineCoreClient: subclasses handle different methods for pushing 
        and pulling from the EngineCore for asyncio / multiprocessing.
@@ -52,8 +54,9 @@ class EngineCoreClient:
        return InprocClient(vllm_config, executor_class, log_stats)
    @abstractmethod
    def shutdown(self):
-        pass
+        ...
    def get_output(self) -> List[EngineCoreOutput]:
        raise NotImplementedError
@@ -107,9 +110,6 @@ class InprocClient(EngineCoreClient):
    def shutdown(self):
        self.engine_core.shutdown()
    def __del__(self):
        self.shutdown()
    def profile(self, is_start: bool = True) -> None:
        self.engine_core.profile(is_start)
@@ -139,10 +139,14 @@ class MPClient(EngineCoreClient):
        self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)
        # ZMQ setup.
-        if asyncio_mode:
+        self.ctx = (
-            self.ctx = zmq.asyncio.Context()
+            zmq.asyncio.Context()  # type: ignore[attr-defined]
-        else:
+            if asyncio_mode else zmq.Context())  # type: ignore[attr-defined]
-            self.ctx = zmq.Context()  # type: ignore[attr-defined]
+
        # Note(rob): shutdown function cannot be a bound method,
        # else the gc cannot collect the object.
        self._finalizer = weakref.finalize(self, lambda x: x.destroy(linger=0),
                                           self.ctx)
        # Paths and sockets for IPC.
        output_path = get_open_zmq_ipc_path()
@@ -153,7 +157,6 @@ class MPClient(EngineCoreClient):
                                            zmq.constants.PUSH)
        # Start EngineCore in background process.
        self.proc_handle: Optional[BackgroundProcHandle]
        self.proc_handle = BackgroundProcHandle(
            input_path=input_path,
            output_path=output_path,
@@ -166,12 +169,11 @@ class MPClient(EngineCoreClient):
            })
    def shutdown(self):
-        # Shut down the zmq context.
+        """Clean up background resources."""
-        self.ctx.destroy(linger=0)
+        if hasattr(self, "proc_handle"):
        if hasattr(self, "proc_handle") and self.proc_handle:
            self.proc_handle.shutdown()
-            self.proc_handle = None
+
        self._finalizer()
 class SyncMPClient(MPClient):
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -205,10 +205,3 @@ class LLMEngine:
                            f"found type: {type(tokenizer_group)}")
        return tokenizer_group
    def __del__(self):
        self.shutdown()
    def shutdown(self):
        if engine_core := getattr(self, "engine_core", None):
            engine_core.shutdown()
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -1,3 +1,4 @@
 import multiprocessing
 import os
 import weakref
 from collections.abc import Sequence
@@ -91,8 +92,6 @@ class BackgroundProcHandle:
        target_fn: Callable,
        process_kwargs: Dict[Any, Any],
    ):
        self._finalizer = weakref.finalize(self, self.shutdown)
        context = get_mp_context()
        reader, writer = context.Pipe(duplex=False)
@@ -102,11 +101,11 @@ class BackgroundProcHandle:
        process_kwargs["ready_pipe"] = writer
        process_kwargs["input_path"] = input_path
        process_kwargs["output_path"] = output_path
        self.input_path = input_path
        self.output_path = output_path
-        # Run Detokenizer busy loop in background process.
+        # Run busy loop in background process.
        self.proc = context.Process(target=target_fn, kwargs=process_kwargs)
        self._finalizer = weakref.finalize(self, shutdown, self.proc,
                                           input_path, output_path)
        self.proc.start()
        # Wait for startup.
@@ -114,21 +113,24 @@ class BackgroundProcHandle:
            raise RuntimeError(f"{process_name} initialization failed. "
                               "See root cause above.")
    def __del__(self):
        self.shutdown()
    def shutdown(self):
-        # Shutdown the process if needed.
+        self._finalizer()
        if hasattr(self, "proc") and self.proc.is_alive():
            self.proc.terminate()
            self.proc.join(5)
            if self.proc.is_alive():
                kill_process_tree(self.proc.pid)
-        # Remove zmq ipc socket files
+# Note(rob): shutdown function cannot be a bound method,
-        ipc_sockets = [self.output_path, self.input_path]
+# else the gc cannot collect the object.
-        for ipc_socket in ipc_sockets:
+def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str):
-            socket_file = ipc_socket.replace("ipc://", "")
+    # Shutdown the process.
-            if os and os.path.exists(socket_file):
+    if proc.is_alive():
-                os.remove(socket_file)
+        proc.terminate()
        proc.join(5)
        if proc.is_alive():
            kill_process_tree(proc.pid)
    # Remove zmq ipc socket files.
    ipc_sockets = [output_path, input_path]
    for ipc_socket in ipc_sockets:
        socket_file = ipc_socket.replace("ipc://", "")
        if os and os.path.exists(socket_file):
            os.remove(socket_file)