[distributed][misc] use fork by default for mp (#5669)

parent 8065a7e220
commit 6c5b7af152
@@ -37,6 +37,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -55,6 +58,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -145,6 +151,9 @@ steps:
   num_gpus: 4
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py
 
 - label: Tensorizer Test
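All three pipeline hunks above export VLLM_WORKER_MULTIPROC_METHOD=spawn for the same reason: something in the test process initializes CUDA before workers are launched, and a forked child cannot safely reuse the parent's CUDA state. A minimal, self-contained sketch of that failure mode (illustrative only, not vLLM code; needs PyTorch and a GPU):

import multiprocessing

import torch


def use_gpu():
    # In a child forked after the parent touched CUDA this typically errors
    # or hangs; in a spawned child, CUDA is initialized from scratch and works.
    print(torch.zeros(1, device="cuda"))


if __name__ == "__main__":
    torch.cuda.init()  # stands in for "some code initialized cuda before the test"
    ctx = multiprocessing.get_context("spawn")  # "fork" here would be unsafe
    p = ctx.Process(target=use_gpu)
    p.start()
    p.join()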
@@ -1,6 +1,9 @@
 import ctypes
 import json
 import os
+import pickle
+import subprocess
+import sys
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         ids = list(range(num_dev))
         # batch of all pairs of GPUs
         batch_src, batch_tgt = zip(*list(product(ids, ids)))
-        result = can_actually_p2p(batch_src, batch_tgt)
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have `if __name__ == "__main__":`,
+        # in that case we cannot use spawn method in multiprocessing.
+        # However, `can_actually_p2p` requires spawn method.
+        # The fix is, we use `subprocess` to call the function,
+        # where we have `if __name__ == "__main__":` in this file.
+        input_bytes = pickle.dumps((batch_src, batch_tgt))
+        returned = subprocess.run([sys.executable, __file__],
+                                  input=input_bytes,
+                                  capture_output=True)
+        # check if the subprocess is successful
+        try:
+            returned.check_returncode()
+        except Exception as e:
+            # wrap raised exception to provide more information
+            raise RuntimeError(
+                f"Error happened when batch testing "
+                f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
+        result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r
         with open(path, "w") as f:
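The NOTE in the hunk above is the heart of the change: multiprocessing's spawn start method re-imports the caller's main module, so if the script that eventually calls gpu_p2p_access_check lacks an `if __name__ == "__main__":` guard, spawning from library code fails during bootstrapping. A minimal sketch of that failure (hypothetical script, not vLLM code):

# save as no_guard.py and run `python no_guard.py`
import multiprocessing


def helper():
    # library code that insists on the spawn start method
    ctx = multiprocessing.get_context("spawn")
    p = ctx.Process(target=print, args=("hello from child",))
    p.start()
    p.join()


# No `if __name__ == "__main__":` guard: the spawned child re-imports this
# module, hits helper() again at import time, and multiprocessing raises a
# RuntimeError about starting a process before bootstrapping has finished.
helper()

Running the worker through `subprocess` sidesteps this, because the child process is this module itself, which does have the guard (added in the hunk below).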
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
 
 
 __all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    sys.stdout.buffer.write(pickle.dumps(result))
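Taken together, the two hunks above make the module double as its own worker script: the parent pickles the arguments onto the child's stdin and reads the pickled result back from its stdout. A stripped-down sketch of the same pattern (toy module with hypothetical names):

# square_worker.py -- toy version of the parent/child protocol used above
import pickle
import subprocess
import sys


def square_all(xs):
    return [x * x for x in xs]


def square_all_in_subprocess(xs):
    # Run this same file as a child. The child must not print anything else
    # to stdout, or the pickled result would be corrupted.
    completed = subprocess.run([sys.executable, __file__],
                               input=pickle.dumps(xs),
                               capture_output=True,
                               check=True)
    return pickle.loads(completed.stdout)


if __name__ == "__main__":
    xs = pickle.loads(sys.stdin.buffer.read())
    sys.stdout.buffer.write(pickle.dumps(square_all(xs)))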
@@ -29,7 +29,7 @@ if TYPE_CHECKING:
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
@@ -212,7 +212,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # Use dedicated multiprocess context for workers.
     # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
 
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds
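Flipping the default to "fork" means a plain user script with no `if __name__ == "__main__":` guard can use the multiprocessing worker backend, while spawn remains available through the environment variable, as the CI changes above show. A rough sketch of how such a setting is typically consumed (illustrative only; the exact vLLM call site may differ):

import multiprocessing
import os

# same lookup as the entry above: default to "fork" unless overridden
mp_method = os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
ctx = multiprocessing.get_context(mp_method)

# worker processes would then be created from this context, e.g.
# proc = ctx.Process(target=worker_main, args=(...,))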