mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 06:35:00 +08:00)
[CI/Build] Test torchrun with 8 cards (#27548)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
This commit is contained in:
parent a9fe0793f2
commit f7a6682872
@@ -205,6 +205,24 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd

+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
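The new step runs the example on all 8 GPUs of an H100 node; torchrun's `--nproc-per-node` has to equal the product of the parallel sizes (tp 2 × pp 1 × dp 4 = 8 ranks), since the `external_launcher` backend treats each launched process as one rank. A minimal sketch of that consistency check (illustrative only, not part of this commit; `expected_world_size` is a hypothetical helper):

```python
import os


def expected_world_size(tp_size: int, pp_size: int, dp_size: int) -> int:
    # With distributed_executor_backend="external_launcher", every rank is a
    # separate process, so torchrun must start exactly tp * pp * dp of them.
    return tp_size * pp_size * dp_size


if __name__ == "__main__":
    # torchrun exports WORLD_SIZE to every process it launches.
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    assert world_size == expected_world_size(2, 1, 4), (
        f"torchrun started {world_size} ranks, expected 8 (tp=2 * pp=1 * dp=4)"
    )
```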
@@ -9,10 +9,76 @@ To run this example:
 ```bash
 $ torchrun --nproc-per-node=2 examples/offline_inference/torchrun_dp_example.py
 ```
+
+With custom parallelism settings:
+
+```bash
+$ torchrun --nproc-per-node=8 examples/offline_inference/torchrun_dp_example.py \
+    --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+```
 """
+
+import argparse
+
 from vllm import LLM, SamplingParams
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Data-parallel inference with torchrun"
+    )
+    parser.add_argument(
+        "--tp-size",
+        type=int,
+        default=1,
+        help="Tensor parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--pp-size",
+        type=int,
+        default=1,
+        help="Pipeline parallel size (default: 1)",
+    )
+    parser.add_argument(
+        "--dp-size",
+        type=int,
+        default=2,
+        help="Data parallel size (default: 2)",
+    )
+    parser.add_argument(
+        "--enable-ep",
+        action="store_true",
+        help="Enable expert parallel (default: False)",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="microsoft/Phi-mini-MoE-instruct",
+        help="Model name or path (default: microsoft/Phi-mini-MoE-instruct)",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=4096,
+        help="Maximum model length (default: 4096)",
+    )
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.6,
+        help="GPU memory utilization (default: 0.6)",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=1,
+        help="Random seed (default: 1)",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+
+
 # Create prompts, the same across all ranks
 prompts = [
     "Hello, my name is",
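Every torchrun-launched process executes this script end to end, so each rank parses the identical command line; rank identity comes from the environment variables torchrun sets (`RANK`, `LOCAL_RANK`, `WORLD_SIZE`), which the `external_launcher` backend reads. A minimal standalone sketch of that launch model (illustrative, not code from the example):

```python
import os
import sys

if __name__ == "__main__":
    # torchrun sets these for each process it starts; the defaults cover a
    # plain single-process invocation for local testing.
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    # All ranks receive exactly the same flags, e.g. --tp-size / --dp-size.
    print(f"rank {rank}/{world_size} argv: {sys.argv[1:]}")
```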
@@ -30,15 +96,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # all ranks have the same random seed, so that sampling can be
 # deterministic across ranks.
 llm = LLM(
-    model="microsoft/Phi-mini-MoE-instruct",
-    tensor_parallel_size=1,
-    data_parallel_size=2,
-    pipeline_parallel_size=1,
-    enable_expert_parallel=False,
+    model=args.model,
+    tensor_parallel_size=args.tp_size,
+    data_parallel_size=args.dp_size,
+    pipeline_parallel_size=args.pp_size,
+    enable_expert_parallel=args.enable_ep,
     distributed_executor_backend="external_launcher",
-    max_model_len=4096,
-    gpu_memory_utilization=0.6,
-    seed=1,
+    max_model_len=args.max_model_len,
+    gpu_memory_utilization=args.gpu_memory_utilization,
+    seed=args.seed,
 )

 dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
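Beyond this hunk, the example uses `dp_rank` so that each data-parallel rank works on its own share of the prompts. A minimal sketch of that pattern, continuing from the `llm`, `prompts`, `sampling_params`, and `dp_rank` objects above (the slicing shown is illustrative rather than the example's exact code):

```python
# dp_size comes from the same parallel_config that provides dp_rank.
dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size

# Round-robin shard: rank r handles prompts r, r + dp_size, r + 2 * dp_size, ...
rank_prompts = prompts[dp_rank::dp_size]

outputs = llm.generate(rank_prompts, sampling_params)
for output in outputs:
    print(f"DP rank {dp_rank}: {output.prompt!r} -> {output.outputs[0].text!r}")
```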