[V1] Improve TP>1 Error Handling + Stack Trace (#11721)

Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Robert Shaw 2025-01-03 16:29:11 -05:00 committed by GitHub
parent 61fed92c7e
commit 1543914c04
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 40 additions and 21 deletions

View File

@ -1,6 +1,5 @@
import asyncio
import os
import signal
from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
from vllm.config import ModelConfig, VllmConfig
@ -42,21 +41,6 @@ class AsyncLLM(EngineClient):
start_engine_loop: bool = True,
) -> None:
# The child processes will send SIGQUIT when unrecoverable
# errors happen. We kill the process tree here so that the
# stack trace is very evident.
# TODO: rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
# error code to the clients calling VLLM.
def sigquit_handler(signum, frame):
logger.fatal(
"AsyncLLM got SIGQUIT from worker processes, shutting "
"down. See stack trace above for root cause issue.")
kill_process_tree(os.getpid())
signal.signal(signal.SIGQUIT, sigquit_handler)
assert start_engine_loop
self.log_requests = log_requests

View File

@ -198,7 +198,7 @@ class EngineCoreProc(EngineCore):
except Exception:
traceback = get_exception_traceback()
logger.error("EngineCore hit an exception: %s", traceback)
parent_process.send_signal(signal.SIGQUIT)
parent_process.send_signal(signal.SIGUSR1)
finally:
if engine_core is not None:

View File

@ -1,3 +1,5 @@
import os
import signal
import weakref
from abc import ABC, abstractmethod
from typing import List, Type
@ -8,7 +10,8 @@ import zmq.asyncio
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket
from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree,
make_zmq_socket)
from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
EngineCoreProfile, EngineCoreRequest,
EngineCoreRequestType, EngineCoreRequestUnion)
@ -134,6 +137,20 @@ class MPClient(EngineCoreClient):
executor_class: Type[Executor],
log_stats: bool = False,
):
# The child processes will send SIGUSR1 when unrecoverable
# errors happen. We kill the process tree here so that the
# stack trace is very evident.
# TODO(rob): rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
# error code to the clients calling VLLM.
def sigusr1_handler(signum, frame):
logger.fatal("Got fatal signal from worker processes, shutting "
"down. See stack trace above for root cause issue.")
kill_process_tree(os.getpid())
signal.signal(signal.SIGUSR1, sigusr1_handler)
# Serialization setup.
self.encoder = PickleEncoder()
self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)

View File

@ -9,6 +9,7 @@ from enum import Enum, auto
from multiprocessing.process import BaseProcess
from typing import Any, Dict, List, Optional, Tuple
import psutil
import zmq
from vllm.config import VllmConfig
@ -38,6 +39,19 @@ class MultiprocExecutor(Executor):
# and ensure workers will be terminated.
self._finalizer = weakref.finalize(self, self.shutdown)
# The child processes will send SIGUSR1 when unrecoverable
# errors happen.
def sigusr1_handler(signum, frame):
logger.fatal(
"MulitprocExecutor got fatal signal from worker processes, "
"shutting down. See stack trace above for root cause issue.")
# Propagate error up to parent process.
parent_process = psutil.Process().parent()
parent_process.send_signal(signal.SIGUSR1)
self.shutdown()
signal.signal(signal.SIGUSR1, sigusr1_handler)
self.vllm_config = vllm_config
self.parallel_config = vllm_config.parallel_config
@ -335,8 +349,11 @@ class WorkerProc:
except SystemExit:
logger.debug("Worker interrupted.")
except BaseException as e:
logger.exception(e)
except Exception:
# worker_busy_loop sends exceptions exceptons to Executor
# for shutdown, but if there is an error in startup or an
# error with IPC itself, we need to alert the parent.
psutil.Process().parent().send_signal(signal.SIGUSR1)
raise
finally:
@ -377,9 +394,10 @@ class WorkerProc:
try:
output = getattr(self.worker, method)(*args, **kwargs)
except BaseException as e:
except Exception as e:
self.worker_response_mq.enqueue(
(WorkerProc.ResponseStatus.FAILURE, e))
logger.exception("WorkerProc hit an exception: %s", exc_info=e)
continue
self.worker_response_mq.enqueue(