diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index 55dae0bb92d4e..43fe5fe2e5e94 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -24,7 +24,7 @@ This document describes how vLLM deals with these challenges. [Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python - 3.14. + 3.14. On macOS, this is already the default. - `fork` - Use `os.fork()` to fork the Python interpreter. This is the default in Python versions prior to 3.14. @@ -34,7 +34,7 @@ This document describes how vLLM deals with these challenges. ### Tradeoffs `fork` is the fastest method, but is incompatible with dependencies that use -threads. +threads. If you are on macOS, using `fork` may cause the process to crash. `spawn` is more compatible with dependencies, but can be problematic when vLLM is used as a library. If the consuming code does not use a `__main__` guard (`if diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 0d54fc73c882b..11ed7c0843779 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -125,8 +125,13 @@ class ShmRingBuffer: lambda *args, **kwargs: None): try: self.shared_memory = shared_memory.SharedMemory(name=name) - assert ( - self.shared_memory.size == self.total_bytes_of_buffer) + # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa + # Some platforms allocate memory based on page size, + # so the shared memory block size may be larger than or equal + # to the requested size. The size parameter is ignored + # when attaching to an existing block.
+ assert (self.shared_memory.size + >= self.total_bytes_of_buffer) except FileNotFoundError: # we might deserialize the object in a different node # in this case, this object is not used, diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 0eb747a4c4514..619219023f4da 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import os +import sys from typing import TYPE_CHECKING, Optional import psutil @@ -148,6 +149,13 @@ class CpuPlatform(Platform): # To hint IPEX uses shared memory based AllReduce os.environ["LOCAL_WORLD_SIZE"] = str( vllm_config.parallel_config.tensor_parallel_size) + if sys.platform == "darwin" and \ + envs.VLLM_WORKER_MULTIPROC_METHOD == "fork": + if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None: + logger.warning( + "Default to spawn method on MacOS. If this is not desired," + " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.") + os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' @classmethod def is_pin_memory_available(cls) -> bool: