From 07eb6f19f3b0ee9f7adf6eb689607028aa40bfd5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 19 Jul 2024 15:34:34 -0700 Subject: [PATCH] [bugfix][distributed] fix multi-node bug for shared memory (#6597) --- vllm/distributed/device_communicators/shm_broadcast.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index bfea106bc027..b572d7224079 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -108,8 +108,14 @@ class ShmRingBuffer: # created by the process. The following patch is a workaround. with patch("multiprocessing.resource_tracker.register", lambda *args, **kwargs: None): - self.shared_memory = shared_memory.SharedMemory(name=name) - assert self.shared_memory.size == self.total_bytes_of_buffer + try: + self.shared_memory = shared_memory.SharedMemory(name=name) + assert self.shared_memory.size == self.total_bytes_of_buffer # noqa + except FileNotFoundError: + # we might deserialize the object in a different node + # in this case, this object is not used, + # and we should suppress the error + pass def __reduce__(self): return (