mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-27 10:21:48 +08:00
[V1] Improve TP>1 Error Handling + Stack Trace (#11721)
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
parent
61fed92c7e
commit
1543914c04
@ -1,6 +1,5 @@
|
||||
import asyncio
|
||||
import os
|
||||
import signal
|
||||
from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
@ -42,21 +41,6 @@ class AsyncLLM(EngineClient):
|
||||
start_engine_loop: bool = True,
|
||||
) -> None:
|
||||
|
||||
# The child processes will send SIGQUIT when unrecoverable
|
||||
# errors happen. We kill the process tree here so that the
|
||||
# stack trace is very evident.
|
||||
# TODO: rather than killing the main process, we should
|
||||
# figure out how to raise an AsyncEngineDeadError and
|
||||
# handle at the API server level so we can return a better
|
||||
# error code to the clients calling VLLM.
|
||||
def sigquit_handler(signum, frame):
    """Handle SIGQUIT by logging a fatal message and killing the process tree.

    ``signum`` and ``frame`` are the standard signal-handler arguments;
    neither is used here.
    """
    shutdown_notice = ("AsyncLLM got SIGQUIT from worker processes, shutting "
                       "down. See stack trace above for root cause issue.")
    logger.fatal(shutdown_notice)

    # Kill the process tree rooted at the current process.
    current_pid = os.getpid()
    kill_process_tree(current_pid)
|
||||
|
||||
signal.signal(signal.SIGQUIT, sigquit_handler)
|
||||
|
||||
assert start_engine_loop
|
||||
|
||||
self.log_requests = log_requests
|
||||
|
||||
@ -198,7 +198,7 @@ class EngineCoreProc(EngineCore):
|
||||
except Exception:
|
||||
traceback = get_exception_traceback()
|
||||
logger.error("EngineCore hit an exception: %s", traceback)
|
||||
parent_process.send_signal(signal.SIGQUIT)
|
||||
parent_process.send_signal(signal.SIGUSR1)
|
||||
|
||||
finally:
|
||||
if engine_core is not None:
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
import os
|
||||
import signal
|
||||
import weakref
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Type
|
||||
@ -8,7 +10,8 @@ import zmq.asyncio
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket
|
||||
from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree,
|
||||
make_zmq_socket)
|
||||
from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
|
||||
EngineCoreProfile, EngineCoreRequest,
|
||||
EngineCoreRequestType, EngineCoreRequestUnion)
|
||||
@ -134,6 +137,20 @@ class MPClient(EngineCoreClient):
|
||||
executor_class: Type[Executor],
|
||||
log_stats: bool = False,
|
||||
):
|
||||
# The child processes will send SIGUSR1 when unrecoverable
|
||||
# errors happen. We kill the process tree here so that the
|
||||
# stack trace is very evident.
|
||||
# TODO(rob): rather than killing the main process, we should
|
||||
# figure out how to raise an AsyncEngineDeadError and
|
||||
# handle at the API server level so we can return a better
|
||||
# error code to the clients calling VLLM.
|
||||
def sigusr1_handler(signum, frame):
    """Handle SIGUSR1 by logging a fatal message and killing the process tree.

    ``signum`` and ``frame`` are the standard signal-handler arguments;
    neither is used here.
    """
    fatal_notice = ("Got fatal signal from worker processes, shutting "
                    "down. See stack trace above for root cause issue.")
    logger.fatal(fatal_notice)

    # Kill the process tree rooted at the current process.
    own_pid = os.getpid()
    kill_process_tree(own_pid)
|
||||
|
||||
signal.signal(signal.SIGUSR1, sigusr1_handler)
|
||||
|
||||
# Serialization setup.
|
||||
self.encoder = PickleEncoder()
|
||||
self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)
|
||||
|
||||
@ -9,6 +9,7 @@ from enum import Enum, auto
|
||||
from multiprocessing.process import BaseProcess
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import psutil
|
||||
import zmq
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
@ -38,6 +39,19 @@ class MultiprocExecutor(Executor):
|
||||
# and ensure workers will be terminated.
|
||||
self._finalizer = weakref.finalize(self, self.shutdown)
|
||||
|
||||
# The child processes will send SIGUSR1 when unrecoverable
|
||||
# errors happen.
|
||||
def sigusr1_handler(signum, frame):
    """Handle SIGUSR1: log, forward the signal to the parent, shut down.

    ``signum`` and ``frame`` are the standard signal-handler arguments;
    neither is used here. Any exception raised while signalling the
    parent still propagates to the caller, but only after
    ``self.shutdown()`` has run.
    """
    logger.fatal(
        "MultiprocExecutor got fatal signal from worker processes, "
        "shutting down. See stack trace above for root cause issue.")
    try:
        # Propagate error up to parent process.
        parent_process = psutil.Process().parent()
        # psutil returns None when no parent PID is known; in that case
        # there is nobody to notify.
        if parent_process is not None:
            parent_process.send_signal(signal.SIGUSR1)
    finally:
        # Shut down even if the parent could not be signalled (e.g. it
        # has already exited).
        self.shutdown()
|
||||
|
||||
signal.signal(signal.SIGUSR1, sigusr1_handler)
|
||||
|
||||
self.vllm_config = vllm_config
|
||||
self.parallel_config = vllm_config.parallel_config
|
||||
|
||||
@ -335,8 +349,11 @@ class WorkerProc:
|
||||
except SystemExit:
|
||||
logger.debug("Worker interrupted.")
|
||||
|
||||
except BaseException as e:
|
||||
logger.exception(e)
|
||||
except Exception:
|
||||
# worker_busy_loop sends exceptions to Executor
|
||||
# for shutdown, but if there is an error in startup or an
|
||||
# error with IPC itself, we need to alert the parent.
|
||||
psutil.Process().parent().send_signal(signal.SIGUSR1)
|
||||
raise
|
||||
|
||||
finally:
|
||||
@ -377,9 +394,10 @@ class WorkerProc:
|
||||
|
||||
try:
|
||||
output = getattr(self.worker, method)(*args, **kwargs)
|
||||
except BaseException as e:
|
||||
except Exception as e:
|
||||
self.worker_response_mq.enqueue(
|
||||
(WorkerProc.ResponseStatus.FAILURE, e))
|
||||
logger.exception("WorkerProc hit an exception: %s", exc_info=e)
|
||||
continue
|
||||
|
||||
self.worker_response_mq.enqueue(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user