From a865bc1ca6f794b0127e601058cee5100860875a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 8 Apr 2025 10:09:03 +0800 Subject: [PATCH] [core] do not send error across process (#16174) Signed-off-by: youkaichao --- vllm/v1/executor/multiproc_executor.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 1d5175eb6adc3..d79bce194b713 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -119,10 +119,9 @@ class MultiprocExecutor(Executor): timeout=dequeue_timeout) if status != WorkerProc.ResponseStatus.SUCCESS: - if isinstance(result, Exception): - raise result - else: - raise RuntimeError("Worker failed") + raise RuntimeError( + "Worker failed with error %s, please check the" + " stack trace above for the root cause", result) responses[w.rank] = result @@ -378,9 +377,11 @@ class WorkerProc: # Notes have been introduced in python 3.11 if hasattr(e, "add_note"): e.add_note(traceback.format_exc()) - self.worker_response_mq.enqueue( - (WorkerProc.ResponseStatus.FAILURE, e)) logger.exception("WorkerProc hit an exception: %s", exc_info=e) + # exception might not be serializable, so we convert it to + # string, only for logging purpose. + self.worker_response_mq.enqueue( + (WorkerProc.ResponseStatus.FAILURE, str(e))) continue self.worker_response_mq.enqueue(