[Core] cleanup zmq ipc sockets on exit (#11115)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
Russell Bryant 2024-12-11 22:12:24 -05:00 committed by GitHub
parent 24a36d6d5f
commit ccede2b264
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 56 additions and 16 deletions

View File

@ -1,4 +1,5 @@
import asyncio import asyncio
import atexit
import importlib import importlib
import inspect import inspect
import multiprocessing import multiprocessing
@ -196,6 +197,14 @@ async def build_async_engine_client_from_engine_args(
assert engine_pid is not None, "Engine process failed to start." assert engine_pid is not None, "Engine process failed to start."
logger.info("Started engine process with PID %d", engine_pid) logger.info("Started engine process with PID %d", engine_pid)
def _cleanup_ipc_path():
socket_path = ipc_path.replace("ipc://", "")
if os.path.exists(socket_path):
os.remove(socket_path)
# Ensure we clean up the local IPC socket file on exit.
atexit.register(_cleanup_ipc_path)
# Build RPCClient, which conforms to EngineClient Protocol. # Build RPCClient, which conforms to EngineClient Protocol.
engine_config = engine_args.create_engine_config() engine_config = engine_args.create_engine_config()
build_client = partial(MQLLMEngineClient, ipc_path, engine_config, build_client = partial(MQLLMEngineClient, ipc_path, engine_config,

View File

@ -4,6 +4,7 @@ import queue
import signal import signal
import threading import threading
import time import time
from dataclasses import dataclass
from multiprocessing.process import BaseProcess from multiprocessing.process import BaseProcess
from typing import List, Tuple, Type, Union from typing import List, Tuple, Type, Union
@ -129,6 +130,14 @@ class EngineCore:
self.model_executor.profile(is_start) self.model_executor.profile(is_start)
@dataclass
class EngineCoreProcHandle:
proc: BaseProcess
ready_path: str
input_path: str
output_path: str
class EngineCoreProc(EngineCore): class EngineCoreProc(EngineCore):
"""ZMQ-wrapper for running EngineCore in background process.""" """ZMQ-wrapper for running EngineCore in background process."""
@ -200,7 +209,7 @@ class EngineCoreProc(EngineCore):
input_path: str, input_path: str,
output_path: str, output_path: str,
ready_path: str, ready_path: str,
) -> BaseProcess: ) -> EngineCoreProcHandle:
# The current process might have CUDA context, # The current process might have CUDA context,
# so we need to spawn a new process. # so we need to spawn a new process.
# NOTE(rob): this is a problem for using EngineCoreProc w/ # NOTE(rob): this is a problem for using EngineCoreProc w/
@ -222,7 +231,10 @@ class EngineCoreProc(EngineCore):
# Wait for startup # Wait for startup
EngineCoreProc.wait_for_startup(proc, ready_path) EngineCoreProc.wait_for_startup(proc, ready_path)
return proc return EngineCoreProcHandle(proc=proc,
ready_path=ready_path,
input_path=input_path,
output_path=output_path)
@staticmethod @staticmethod
def run_engine_core(*args, **kwargs): def run_engine_core(*args, **kwargs):

View File

@ -1,4 +1,5 @@
import atexit import atexit
import os
from typing import List, Union from typing import List, Union
import msgspec import msgspec
@ -148,7 +149,7 @@ class MPClient(EngineCoreClient):
self.input_socket.bind(input_path) self.input_socket.bind(input_path)
# Start EngineCore in background process. # Start EngineCore in background process.
self.proc = EngineCoreProc.make_engine_core_process( self.proc_handle = EngineCoreProc.make_engine_core_process(
*args, *args,
input_path=input_path, input_path=input_path,
output_path=output_path, output_path=output_path,
@ -161,13 +162,24 @@ class MPClient(EngineCoreClient):
# Shut down the zmq context. # Shut down the zmq context.
self.ctx.destroy(linger=0) self.ctx.destroy(linger=0)
# Shutdown the process if needed. if hasattr(self, "proc_handle"):
if hasattr(self, "proc") and self.proc.is_alive(): # Shutdown the process if needed.
self.proc.terminate() if self.proc_handle.proc.is_alive():
self.proc.join(5) self.proc_handle.proc.terminate()
self.proc_handle.proc.join(5)
if self.proc.is_alive(): if self.proc_handle.proc.is_alive():
kill_process_tree(self.proc.pid) kill_process_tree(self.proc_handle.proc.pid)
# Remove zmq ipc socket files
ipc_sockets = [
self.proc_handle.ready_path, self.proc_handle.output_path,
self.proc_handle.input_path
]
for ipc_socket in ipc_sockets:
socket_file = ipc_socket.replace("ipc://", "")
if os.path.exists(socket_file):
os.remove(socket_file)
def __del__(self): def __del__(self):
self.shutdown() self.shutdown()

View File

@ -172,16 +172,23 @@ class MultiprocExecutor:
# Send SIGTERM if still running # Send SIGTERM if still running
active_procs = [w.proc for w in self.workers if w.proc.is_alive()] active_procs = [w.proc for w in self.workers if w.proc.is_alive()]
self.workers = None
for p in active_procs: for p in active_procs:
p.terminate() p.terminate()
if wait_for_termination(active_procs, 4): if not wait_for_termination(active_procs, 4):
return # Send SIGKILL if still running
active_procs = [p for p in active_procs if p.is_alive()]
for p in active_procs:
p.kill()
# Send SIGKILL if still running self._cleanup_sockets()
active_procs = [p for p in active_procs if p.is_alive()] self.workers = None
for p in active_procs:
p.kill() def _cleanup_sockets(self):
for w in self.workers:
# Remove the zmq ipc socket file
socket_path = w.ready_path.replace("ipc://", "")
if os.path.exists(socket_path):
os.remove(socket_path)
def shutdown(self): def shutdown(self):
"""Properly shut down the executor and its workers""" """Properly shut down the executor and its workers"""