[distributed][misc] use fork by default for mp (#5669)

Author: youkaichao
Date: 2024-06-20 17:06:34 -07:00 (committed by GitHub)
Parent: 8065a7e220
Commit: 6c5b7af152
3 changed files with 38 additions and 3 deletions

View File

@@ -37,6 +37,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -55,6 +58,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -145,6 +151,9 @@ steps:
   num_gpus: 4
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py
 
 - label: Tensorizer Test
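
The three `export VLLM_WORKER_MULTIPROC_METHOD=spawn` lines above pin the affected CI steps to spawn: something in their setup initializes CUDA before the workers are created, and a CUDA context does not survive fork. Below is a minimal sketch of that failure mode, not part of this commit; it assumes PyTorch and at least one CUDA GPU are available.

# Sketch only: shows why fork breaks once the parent has touched CUDA,
# while a spawn-started worker gets a fresh interpreter and works.
import multiprocessing as mp

import torch


def use_gpu() -> bool:
    # Any CUDA call in the child. In a fork-started child PyTorch typically
    # raises "Cannot re-initialize CUDA in forked subprocess".
    return torch.zeros(1, device="cuda").is_cuda


if __name__ == "__main__":
    torch.cuda.init()  # the parent initializes CUDA before creating workers

    for method in ("fork", "spawn"):
        ctx = mp.get_context(method)
        with ctx.Pool(1) as pool:
            try:
                print(method, "->", pool.apply(use_gpu))
            except Exception as e:  # fork is expected to fail here
                print(method, "-> failed:", e)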

View File

@@ -1,6 +1,9 @@
 import ctypes
 import json
 import os
+import pickle
+import subprocess
+import sys
 from itertools import product
 from typing import Dict, List, Optional, Sequence
@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         ids = list(range(num_dev))
         # batch of all pairs of GPUs
         batch_src, batch_tgt = zip(*list(product(ids, ids)))
-        result = can_actually_p2p(batch_src, batch_tgt)
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have `if __name__ == "__main__":`,
+        # in that case we cannot use spawn method in multiprocessing.
+        # However, `can_actually_p2p` requires spawn method.
+        # The fix is, we use `subprocess` to call the function,
+        # where we have `if __name__ == "__main__":` in this file.
+        input_bytes = pickle.dumps((batch_src, batch_tgt))
+        returned = subprocess.run([sys.executable, __file__],
+                                  input=input_bytes,
+                                  capture_output=True)
+        # check if the subprocess is successful
+        try:
+            returned.check_returncode()
+        except Exception as e:
+            # wrap raised exception to provide more information
+            raise RuntimeError(
+                f"Error happened when batch testing "
+                f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
+        result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r
         with open(path, "w") as f:
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
 __all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    sys.stdout.buffer.write(pickle.dumps(result))
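
The NOTE in the hunk above captures the design: callers of `gpu_p2p_access_check` cannot be forced to have an `if __name__ == "__main__":` guard (which multiprocessing's spawn method needs), so the module re-executes itself as a subprocess and moves the arguments and the result as pickled bytes over stdin/stdout. Here is a self-contained sketch of that round-trip pattern; the names `work` and `run_in_subprocess` are hypothetical, not vLLM APIs.

# Sketch of the "call yourself as a subprocess with pickled stdin/stdout"
# pattern used above. Save as a single file and run it directly.
import pickle
import subprocess
import sys


def work(xs):
    # stand-in for `can_actually_p2p`; any picklable computation works
    return [x * x for x in xs]


def run_in_subprocess(xs):
    proc = subprocess.run([sys.executable, __file__],
                          input=pickle.dumps(xs),
                          capture_output=True)
    proc.check_returncode()  # surface child failures to the caller
    return pickle.loads(proc.stdout)


if __name__ == "__main__":
    if sys.stdin.isatty():
        # invoked from a terminal: act as the parent and call ourselves
        print(run_in_subprocess([1, 2, 3]))  # prints [1, 4, 9]
    else:
        # invoked with piped stdin: act as the child worker
        data = pickle.loads(sys.stdin.buffer.read())
        sys.stdout.buffer.write(pickle.dumps(work(data)))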

View File

@@ -29,7 +29,7 @@ if TYPE_CHECKING:
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
@@ -212,7 +212,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds
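
With the default flipped from "spawn" to "fork", code that imports vLLM without an `if __name__ == "__main__":` guard can still start worker processes, while setting `VLLM_WORKER_MULTIPROC_METHOD=spawn` (as the CI steps above do) restores the previous behavior. A hedged sketch, not vLLM's actual executor code, of how a launcher might consume the variable:

# Illustrative only; `_worker` and `launch_workers` are made-up names.
import multiprocessing as mp
import os


def _worker(rank: int) -> None:
    print(f"worker {rank} running in pid {os.getpid()}")


def launch_workers(num_workers: int) -> None:
    # mirrors the new default: fall back to "fork" when the env var is unset
    method = os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
    ctx = mp.get_context(method)
    procs = [ctx.Process(target=_worker, args=(i,)) for i in range(num_workers)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()


# A module-level call like this is fine under "fork", because forked children
# do not re-import the module. Under "spawn" it would have to live inside an
# `if __name__ == "__main__":` guard to avoid re-executing on child import.
launch_workers(2)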