mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-30 05:17:03 +08:00
[CI] Fix H200 Distributed test (#31054)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
This commit is contained in:
parent
ee52d9901d
commit
ae0770fa6b
@ -1254,13 +1254,13 @@ steps:
|
|||||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs) # 68min
|
- label: Distributed Tests (2 GPUs) # 68min
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 90
|
||||||
@ -1508,7 +1508,7 @@ steps:
|
|||||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
|
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### B200 test #####
|
##### B200 test #####
|
||||||
|
|||||||
@ -1109,13 +1109,13 @@ steps:
|
|||||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs) # 68min
|
- label: Distributed Tests (2 GPUs) # 68min
|
||||||
timeout_in_minutes: 90
|
timeout_in_minutes: 90
|
||||||
@ -1334,7 +1334,7 @@ steps:
|
|||||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### B200 test #####
|
##### B200 test #####
|
||||||
|
|||||||
@ -145,7 +145,7 @@ steps:
|
|||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
- label: Distributed Tests (2 GPUs)(B200)
|
- label: Distributed Tests (2 GPUs)(B200)
|
||||||
@ -171,7 +171,7 @@ steps:
|
|||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
commands:
|
commands:
|
||||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
|
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
|
||||||
|
|
||||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 30
|
||||||
|
|||||||
@ -5,25 +5,25 @@ Usage:
|
|||||||
Single node:
|
Single node:
|
||||||
python examples/offline_inference/data_parallel.py \
|
python examples/offline_inference/data_parallel.py \
|
||||||
--model="ibm-research/PowerMoE-3b" \
|
--model="ibm-research/PowerMoE-3b" \
|
||||||
--dp-size=2 \
|
-dp=2 \
|
||||||
--tp-size=2
|
-tp=2
|
||||||
|
|
||||||
Multi-node:
|
Multi-node:
|
||||||
Node 0 (assume the node has ip of 10.99.48.128):
|
Node 0 (assume the node has ip of 10.99.48.128):
|
||||||
python examples/offline_inference/data_parallel.py \
|
python examples/offline_inference/data_parallel.py \
|
||||||
--model="ibm-research/PowerMoE-3b" \
|
--model="ibm-research/PowerMoE-3b" \
|
||||||
--dp-size=2 \
|
-dp=2 \
|
||||||
--tp-size=2 \
|
-tp=2 \
|
||||||
--node-size=2 \
|
--nnodes=2 \
|
||||||
--node-rank=0 \
|
--node-rank=0 \
|
||||||
--master-addr=10.99.48.128 \
|
--master-addr=10.99.48.128 \
|
||||||
--master-port=13345
|
--master-port=13345
|
||||||
Node 1:
|
Node 1:
|
||||||
python examples/offline_inference/data_parallel.py \
|
python examples/offline_inference/data_parallel.py \
|
||||||
--model="ibm-research/PowerMoE-3b" \
|
--model="ibm-research/PowerMoE-3b" \
|
||||||
--dp-size=2 \
|
-dp=2 \
|
||||||
--tp-size=2 \
|
-tp=2 \
|
||||||
--node-size=2 \
|
--nnodes=2 \
|
||||||
--node-rank=1 \
|
--node-rank=1 \
|
||||||
--master-addr=10.99.48.128 \
|
--master-addr=10.99.48.128 \
|
||||||
--master-port=13345
|
--master-port=13345
|
||||||
@ -32,103 +32,40 @@ Multi-node:
|
|||||||
import os
|
import os
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, EngineArgs, SamplingParams
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||||
from vllm.utils.network_utils import get_open_port
|
from vllm.utils.network_utils import get_open_port
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def create_parser():
|
||||||
import argparse
|
parser = FlexibleArgumentParser(description="Data Parallel Inference")
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Data Parallel Inference")
|
# Add all engine args
|
||||||
parser.add_argument(
|
EngineArgs.add_cli_args(parser)
|
||||||
"--model",
|
parser.set_defaults(
|
||||||
type=str,
|
model="ibm-research/PowerMoE-3b",
|
||||||
default="ibm-research/PowerMoE-3b",
|
enable_expert_parallel=True,
|
||||||
help="Model name or path",
|
|
||||||
)
|
|
||||||
parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
|
|
||||||
parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
|
|
||||||
parser.add_argument(
|
|
||||||
"--node-size", type=int, default=1, help="Total number of nodes"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--node-rank", type=int, default=0, help="Rank of the current node"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--master-addr", type=str, default="", help="Master node IP address"
|
|
||||||
)
|
|
||||||
parser.add_argument("--master-port", type=int, default=0, help="Master node port")
|
|
||||||
parser.add_argument(
|
|
||||||
"--enforce-eager", action="store_true", help="Enforce eager mode execution."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--trust-remote-code", action="store_true", help="Trust remote code."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-num-seqs",
|
|
||||||
type=int,
|
|
||||||
default=64,
|
|
||||||
help=("Maximum number of sequences to be processed in a single iteration."),
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--max-model-len",
|
|
||||||
type=int,
|
|
||||||
help=("Maximum number of tokens to be processed in a single iteration."),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Add timeout (not in EngineArgs)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--timeout",
|
"--timeout",
|
||||||
type=int,
|
type=int,
|
||||||
default=300,
|
default=300,
|
||||||
help=("Number of seconds before unresponsive process is killed."),
|
help="Number of seconds before unresponsive process is killed.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"--gpu-memory-utilization",
|
return parser
|
||||||
type=float,
|
|
||||||
default=0.8,
|
|
||||||
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--enable-dbo",
|
|
||||||
action="store_true",
|
|
||||||
help=("Enable microbatched execution"),
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--compilation-config",
|
|
||||||
type=int,
|
|
||||||
help=("Compilation optimization (O) mode 0-3."),
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--quantization",
|
|
||||||
type=str,
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--disable-expert-parallel",
|
|
||||||
dest="enable_expert_parallel",
|
|
||||||
action="store_false",
|
|
||||||
help="Disable expert parallel (default: enabled).",
|
|
||||||
)
|
|
||||||
parser.set_defaults(enable_expert_parallel=True)
|
|
||||||
return parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def main(
|
def main(
|
||||||
model,
|
|
||||||
dp_size,
|
dp_size,
|
||||||
local_dp_rank,
|
local_dp_rank,
|
||||||
global_dp_rank,
|
global_dp_rank,
|
||||||
dp_master_ip,
|
dp_master_ip,
|
||||||
dp_master_port,
|
dp_master_port,
|
||||||
GPUs_per_dp_rank,
|
engine_args,
|
||||||
enforce_eager,
|
|
||||||
enable_expert_parallel,
|
|
||||||
trust_remote_code,
|
|
||||||
max_num_seqs,
|
|
||||||
max_model_len,
|
|
||||||
compilation_config,
|
|
||||||
gpu_memory_utilization,
|
|
||||||
enable_dbo,
|
|
||||||
quantization,
|
|
||||||
):
|
):
|
||||||
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
|
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
|
||||||
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
|
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
|
||||||
@ -173,19 +110,7 @@ def main(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create an LLM.
|
# Create an LLM.
|
||||||
llm = LLM(
|
llm = LLM(**engine_args)
|
||||||
model=model,
|
|
||||||
tensor_parallel_size=GPUs_per_dp_rank,
|
|
||||||
enforce_eager=enforce_eager,
|
|
||||||
enable_expert_parallel=enable_expert_parallel,
|
|
||||||
trust_remote_code=trust_remote_code,
|
|
||||||
max_num_seqs=max_num_seqs,
|
|
||||||
max_model_len=max_model_len,
|
|
||||||
gpu_memory_utilization=gpu_memory_utilization,
|
|
||||||
enable_dbo=enable_dbo,
|
|
||||||
quantization=quantization,
|
|
||||||
compilation_config=compilation_config,
|
|
||||||
)
|
|
||||||
outputs = llm.generate(prompts, sampling_params)
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
# Print the outputs.
|
# Print the outputs.
|
||||||
for i, output in enumerate(outputs):
|
for i, output in enumerate(outputs):
|
||||||
@ -204,22 +129,29 @@ def main(
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parse_args()
|
parser = create_parser()
|
||||||
|
args = vars(parser.parse_args())
|
||||||
|
|
||||||
dp_size = args.dp_size
|
# Extract DP-specific args
|
||||||
tp_size = args.tp_size
|
dp_size = args.pop("data_parallel_size")
|
||||||
node_size = args.node_size
|
nnodes = args.get("nnodes", 1)
|
||||||
node_rank = args.node_rank
|
node_rank = args.get("node_rank", 0)
|
||||||
|
master_addr = args.get("master_addr", "")
|
||||||
|
master_port = args.get("master_port", 0)
|
||||||
|
timeout = args.pop("timeout")
|
||||||
|
|
||||||
if node_size == 1:
|
# Remaining args are engine args
|
||||||
|
engine_args = args
|
||||||
|
|
||||||
|
if nnodes == 1:
|
||||||
dp_master_ip = "127.0.0.1"
|
dp_master_ip = "127.0.0.1"
|
||||||
dp_master_port = get_open_port()
|
dp_master_port = get_open_port()
|
||||||
else:
|
else:
|
||||||
dp_master_ip = args.master_addr
|
dp_master_ip = master_addr
|
||||||
dp_master_port = args.master_port
|
dp_master_port = master_port
|
||||||
|
|
||||||
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
|
assert dp_size % nnodes == 0, "dp_size should be divisible by nnodes"
|
||||||
dp_per_node = dp_size // node_size
|
dp_per_node = dp_size // nnodes
|
||||||
|
|
||||||
from multiprocessing import Process
|
from multiprocessing import Process
|
||||||
|
|
||||||
@ -235,29 +167,19 @@ if __name__ == "__main__":
|
|||||||
proc = Process(
|
proc = Process(
|
||||||
target=main,
|
target=main,
|
||||||
args=(
|
args=(
|
||||||
args.model,
|
|
||||||
dp_size,
|
dp_size,
|
||||||
local_dp_rank,
|
local_dp_rank,
|
||||||
global_dp_rank,
|
global_dp_rank,
|
||||||
dp_master_ip,
|
dp_master_ip,
|
||||||
dp_master_port,
|
dp_master_port,
|
||||||
tp_size,
|
engine_args,
|
||||||
args.enforce_eager,
|
|
||||||
args.enable_expert_parallel,
|
|
||||||
args.trust_remote_code,
|
|
||||||
args.max_num_seqs,
|
|
||||||
args.max_model_len,
|
|
||||||
args.compilation_config,
|
|
||||||
args.gpu_memory_utilization,
|
|
||||||
args.enable_dbo,
|
|
||||||
args.quantization,
|
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
proc.start()
|
proc.start()
|
||||||
procs.append(proc)
|
procs.append(proc)
|
||||||
exit_code = 0
|
exit_code = 0
|
||||||
for proc in procs:
|
for proc in procs:
|
||||||
proc.join(timeout=args.timeout)
|
proc.join(timeout=timeout)
|
||||||
if proc.exitcode is None:
|
if proc.exitcode is None:
|
||||||
print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
|
print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
|
||||||
proc.kill()
|
proc.kill()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user