diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md
new file mode 100644
index 0000000000000..b58456ecc6da8
--- /dev/null
+++ b/docs/source/design/multiprocessing.md
@@ -0,0 +1,195 @@
+# Python Multiprocessing
+
+## Debugging
+
+Please see the [Debugging
+Tips](https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing)
+page for information on known issues and how to solve them.
+
+## Introduction
+
+*Note that source code references are to the state of the code at the time of writing in December, 2024.*
+
+The use of Python multiprocessing in vLLM is complicated by:
+
+- The use of vLLM as a library and the inability to control the code using vLLM
+- Varying levels of incompatibilities between multiprocessing methods and vLLM
+  dependencies
+
+This document describes how vLLM deals with these challenges.
+
+## Multiprocessing Methods
+
+[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
+
+- `spawn` - spawn a new Python process. This will be the default as of Python
+  3.14.
+
+- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default
+  in Python versions prior to 3.14.
+
+- `forkserver` - Spawn a server process that will fork a new process on request.
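+
+Rather than relying on the platform default, a start method can also be
+requested explicitly. The standalone sketch below (stdlib only, not vLLM code)
+shows the API that vLLM builds on; note that the `__main__` guard is what makes
+the example safe under `spawn`:
+
+```python
+import multiprocessing
+
+
+def double(x: int) -> int:
+    return x * 2
+
+
+if __name__ == "__main__":
+    # Explicitly request the `spawn` start method instead of the default.
+    ctx = multiprocessing.get_context("spawn")
+    with ctx.Pool(2) as pool:
+        print(pool.map(double, range(4)))
+```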
+
+### Tradeoffs
+
+`fork` is the fastest method, but is incompatible with dependencies that use
+threads.
+
+`spawn` is more compatible with dependencies, but can be problematic when vLLM
+is used as a library. If the consuming code does not use a `__main__` guard (`if
+__name__ == "__main__":`), the code will be inadvertently re-executed when vLLM
+spawns a new process. This can lead to infinite recursion, among other problems.
+
+`forkserver` will spawn a new server process that will fork new processes on
+demand. This unfortunately has the same problem as `spawn` when vLLM is used as
+a library. The server process is created as a spawned new process, which will
+re-execute code not protected by a `__main__` guard.
+
+For both `spawn` and `forkserver`, the process must not depend on inheriting any
+global state as would be the case with `fork`.
+
+## Compatibility with Dependencies
+
+Multiple vLLM dependencies indicate either a preference or requirement for using
+`spawn`:
+
+- 
+- 
+- 
+
+It is perhaps more accurate to say that there are known problems with using
+`fork` after initializing these dependencies.
+
+## Current State (v0)
+
+The environment variable `VLLM_WORKER_MULTIPROC_METHOD` can be used to control which method is used by vLLM. The current default is `fork`.
+
+- 
+
+When we know we own the process because the `vllm` command was used, we use
+`spawn` because it's the most widely compatible.
+
+- 
+
+The `multiproc_xpu_executor` forces the use of `spawn`.
+
+- 
+
+There are other miscellaneous places hard-coding the use of `spawn`:
+
+- 
+- 
+
+Related PRs:
+
+- 
+
+## Prior State in v1
+
+There was an environment variable to control whether multiprocessing is used in
+the v1 engine core, `VLLM_ENABLE_V1_MULTIPROCESSING`. This defaulted to off.
+
+- 
+
+When it was enabled, the v1 `LLMEngine` would create a new process to run the
+engine core.
+
+- 
+- 
+- https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/core_client.py#L44-L45
+
+It was off by default for all the reasons mentioned above - compatibility with
+dependencies and code using vLLM as a library.
+
+### Changes Made in v1
+
+There is not an easy solution with Python's `multiprocessing` that will work
+everywhere. As a first step, we can get v1 into a state where it makes a "best
+effort" choice of multiprocessing method to maximize compatibility, as outlined
+below.
+
+- Default to `fork`.
+- Use `spawn` when we know we control the main process (`vllm` was executed).
+- If we detect `cuda` was previously initialized, force `spawn` and emit a
+  warning. We know `fork` will break, so this is the best we can do.
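+
+The sketch below condenses this decision into a single function. It is
+illustrative only - the helper names (`cuda_is_initialized`,
+`owns_main_process`) stand in for the real checks, which are spread across the
+code base:
+
+```python
+import multiprocessing
+
+
+def choose_start_method(cuda_is_initialized: bool,
+                        owns_main_process: bool) -> str:
+    """Best-effort choice of a multiprocessing start method."""
+    if cuda_is_initialized:
+        # `fork` is known to break once CUDA is initialized, so force `spawn`
+        # and warn the user (see the log message shown below).
+        return "spawn"
+    if owns_main_process:
+        # The `vllm` command was used, so `spawn` is safe and most compatible.
+        return "spawn"
+    # Library use: default to `fork` so the caller's code is not re-executed.
+    return "fork"
+
+
+ctx = multiprocessing.get_context(choose_start_method(False, False))
+```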
+
+The case that is known to still break in this scenario is code using vLLM as a
+library that initializes `cuda` before calling vLLM. The warning we emit should
+instruct users to either add a `__main__` guard or to disable multiprocessing.
+
+If that known-failure case occurs, the user will see two messages that explain
+what is happening. First, a log message from vLLM:
+
+```
+WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
+    initialized. We must use the `spawn` multiprocessing start method. Setting
+    VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
+    https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
+    for more information.
+```
+
+Second, Python itself will raise an exception with a nice explanation:
+
+```
+RuntimeError:
+        An attempt has been made to start a new process before the
+        current process has finished its bootstrapping phase.
+
+        This probably means that you are not using fork to start your
+        child processes and you have forgotten to use the proper idiom
+        in the main module:
+
+            if __name__ == '__main__':
+                freeze_support()
+                ...
+
+        The "freeze_support()" line can be omitted if the program
+        is not going to be frozen to produce an executable.
+
+        To fix this issue, refer to the "Safe importing of main module"
+        section in https://docs.python.org/3/library/multiprocessing.html
+```
+
+## Alternatives Considered
+
+### Detect if a `__main__` guard is present
+
+It has been suggested that we could behave better if we could detect whether
+code using vLLM as a library has a `__main__` guard in place. This [post on
+stackoverflow](https://stackoverflow.com/questions/77220442/multiprocessing-pool-in-a-python-class-without-name-main-guard)
+was from a library author facing the same question.
+
+It is possible to detect whether we are in the original `__main__` process or a
+subsequently spawned process. However, it does not appear to be straightforward
+to detect whether a `__main__` guard is present in the code.
+
+This option has been discarded as impractical.
+
+### Use `forkserver`
+
+At first it appears that `forkserver` is a nice solution to the problem.
+However, the way it works presents the same challenges that `spawn` does when
+vLLM is used as a library.
+
+### Force `spawn` all the time
+
+One way to clean this up is to just force the use of `spawn` all the time and
+document that the use of a `__main__` guard is required when using vLLM as a
+library. This would unfortunately break existing code and make vLLM harder to
+use, violating the goal of keeping the `LLM` class as easy to use as possible.
+
+Instead of pushing this on our users, we will retain the complexity to do our
+best to make things work.
+
+## Future Work
+
+We may want to consider a different worker management approach in the future
+that works around these challenges.
+
+1. We could implement something `forkserver`-like, but have the process manager
+   be something we launch ourselves at startup, running our own subprocess with
+   a custom entrypoint for worker management (launch a `vllm-manager` process).
+
+2. We can explore other libraries that may better suit our needs. Examples to
+   consider:
+
+- 
diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
index 0c1afcbd7c0b9..d6c83014dc69f 100644
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -136,6 +136,62 @@ If the test script hangs or crashes, usually it means the hardware/drivers are b
 
 Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes.
 
+Python multiprocessing
+----------------------
+
+`RuntimeError` Exception
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you have seen a warning in your logs like this:
+
+.. code-block:: console
+
+    WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
+    initialized. We must use the `spawn` multiprocessing start method. Setting
+    VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
+    https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
+    for more information.
+
+or an error from Python that looks like this:
+
+.. code-block:: console
+
+    RuntimeError:
+            An attempt has been made to start a new process before the
+            current process has finished its bootstrapping phase.
+
+            This probably means that you are not using fork to start your
+            child processes and you have forgotten to use the proper idiom
+            in the main module:
+
+                if __name__ == '__main__':
+                    freeze_support()
+                    ...
+
+            The "freeze_support()" line can be omitted if the program
+            is not going to be frozen to produce an executable.
+
+            To fix this issue, refer to the "Safe importing of main module"
+            section in https://docs.python.org/3/library/multiprocessing.html
+
+then you must update your Python code to guard usage of ``vllm`` behind an ``if
+__name__ == '__main__':`` block. For example, instead of this:
+
+.. code-block:: python
+
+    import vllm
+
+    llm = vllm.LLM(...)
+
+try this instead:
+
+.. code-block:: python
+
+    if __name__ == '__main__':
+        import vllm
+
+        llm = vllm.LLM(...)
+
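+If restructuring your application around a ``__main__`` guard is not practical,
+a possible fallback (noted in the multiprocessing design document) is to
+disable multiprocessing for the v1 engine. Note that this only affects the v1
+code path and is a workaround rather than a fix:
+
+.. code-block:: python
+
+    import os
+
+    # Must be set before vLLM creates the engine.
+    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+
+    import vllm
+
+    llm = vllm.LLM(...)
+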
 Known Issues
 ----------------------------------------
 - In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 842013d6d49c4..8ac09f6988893 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -173,6 +173,7 @@ Documentation
    design/input_processing/model_inputs_index
    design/kernel/paged_attention
    design/multimodal/multimodal_index
+   design/multiprocessing
 
 ..
    For Developers: contributing to the vLLM project
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 11b2574ce42dd..58ab892676b9a 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -232,6 +232,10 @@ class LLM:
 
         self.request_counter = Counter()
 
+    def __del__(self):
+        if self.llm_engine and hasattr(self.llm_engine, "shutdown"):
+            self.llm_engine.shutdown()
+
     @staticmethod
     def get_engine_class() -> Type[LLMEngine]:
         if envs.VLLM_USE_V1:
diff --git a/vllm/envs.py b/vllm/envs.py
index bc8c1499e9534..da17b747ea215 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -69,7 +69,7 @@ if TYPE_CHECKING:
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_DISABLED_KERNELS: List[str] = []
     VLLM_USE_V1: bool = False
-    VLLM_ENABLE_V1_MULTIPROCESSING: bool = False
+    VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
    VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
 
 
@@ -460,7 +460,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
 
     # If set, enable multiprocessing in LLM for the V1 code path.
     "VLLM_ENABLE_V1_MULTIPROCESSING":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0"))),
+    lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))),
     "VLLM_LOG_BATCHSIZE_INTERVAL":
     lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")),
 }
diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index fe475db6d3f57..c4d90f0856f86 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -274,7 +274,20 @@ def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None:
     file.write = write_with_prefix  # type: ignore[method-assign]
 
 
+def _check_multiproc_method():
+    if (cuda_is_initialized()
+            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+        logger.warning("CUDA was previously initialized. We must use "
+                       "the `spawn` multiprocessing start method. Setting "
+                       "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
+                       "See https://docs.vllm.ai/en/latest/getting_started/"
+                       "debugging.html#python-multiprocessing "
+                       "for more information.")
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
 def get_mp_context():
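+    # Force the `spawn` start method if CUDA has already been initialized in
+    # this process, since `fork` would not be safe at that point.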
+    _check_multiproc_method()
     mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
     return multiprocessing.get_context(mp_method)
 
 
@@ -284,12 +297,7 @@ def set_multiprocessing_worker_envs(parallel_config):
     in a multiprocessing environment. This should be called by the parent
     process before worker processes are created"""
 
-    if (cuda_is_initialized()
-            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
-        logger.warning("CUDA was previously initialized. We must use "
-                       "the `spawn` multiprocessing start method. Setting "
-                       "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
-        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+    _check_multiproc_method()
 
     # Configure thread parallelism if OMP_NUM_THREADS isn't set
     #
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 6246a0067842a..ee7419bce2565 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -1,4 +1,3 @@
-import multiprocessing
 import pickle
 import queue
 import signal
@@ -13,6 +12,7 @@ import zmq.asyncio
 from msgspec import msgpack
 
 from vllm.config import CacheConfig, VllmConfig
+from vllm.executor.multiproc_worker_utils import get_mp_context
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.core.scheduler import Scheduler
@@ -210,11 +210,7 @@ class EngineCoreProc(EngineCore):
         output_path: str,
         ready_path: str,
     ) -> EngineCoreProcHandle:
-        # The current process might have CUDA context,
-        # so we need to spawn a new process.
-        # NOTE(rob): this is a problem for using EngineCoreProc w/
-        # LLM, since we need a if __name__ == "__main__" guard.
-        context = multiprocessing.get_context("spawn")
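+        # get_mp_context() honors VLLM_WORKER_MULTIPROC_METHOD (default `fork`)
+        # and falls back to `spawn` automatically if CUDA was already
+        # initialized in this process.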
+        context = get_mp_context()
 
         process_kwargs = {
             "input_path": input_path,
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index a66ae111be8c5..e0bfe1b93b360 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -159,10 +159,16 @@ class MPClient(EngineCoreClient):
         atexit.register(self.shutdown)
 
     def shutdown(self):
+        # During final garbage collection in process shutdown, atexit may be
+        # None.
+        if atexit:
+            # In case shutdown gets called via __del__ first.
+            atexit.unregister(self.shutdown)
+
         # Shut down the zmq context.
         self.ctx.destroy(linger=0)
 
-        if hasattr(self, "proc_handle"):
+        if hasattr(self, "proc_handle") and self.proc_handle:
             # Shutdown the process if needed.
             if self.proc_handle.proc.is_alive():
                 self.proc_handle.proc.terminate()
@@ -178,8 +184,9 @@ class MPClient(EngineCoreClient):
             ]
             for ipc_socket in ipc_sockets:
                 socket_file = ipc_socket.replace("ipc://", "")
-                if os.path.exists(socket_file):
+                if os and os.path.exists(socket_file):
                     os.remove(socket_file)
+            self.proc_handle = None
 
     def __del__(self):
         self.shutdown()
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 1b3a9f12d009e..c02494897b41f 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -196,3 +196,10 @@ class LLMEngine:
                             f"found type: {type(tokenizer_group)}")
 
         return tokenizer_group
+
+    def __del__(self):
+        self.shutdown()
+
+    def shutdown(self):
+        if engine_core := getattr(self, "engine_core", None):
+            engine_core.shutdown()
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 63a12f791051f..14384a730ceec 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -163,6 +163,10 @@ class MultiprocExecutor:
         termination and kill signals if needed."""
 
         def wait_for_termination(procs, timeout):
+            if not time:
+                # If we are in late-stage shutdown, the interpreter may have
+                # replaced `time` with `None`.
+                return all(not proc.is_alive() for proc in procs)
             start_time = time.time()
             while time.time() - start_time < timeout:
                 if all(not proc.is_alive() for proc in procs):
@@ -187,10 +191,14 @@ class MultiprocExecutor:
         for w in self.workers:
             # Remove the zmq ipc socket file
             socket_path = w.ready_path.replace("ipc://", "")
-            if os.path.exists(socket_path):
+            if os and os.path.exists(socket_path):
                 os.remove(socket_path)
 
     def shutdown(self):
         """Properly shut down the executor and its workers"""
+        if atexit:
+            # In case shutdown was called explicitly, we don't need to call it
+            # again via atexit.
+            atexit.unregister(self.shutdown)
         if (hasattr(self, 'workers') and self.workers is not None):
             for w in self.workers:  #TODO: not sure if needed