mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 03:05:02 +08:00
Add support for multi-node on CI (#5955)
Signed-off-by: kevin <kevin@anyscale.com>
This commit is contained in:
parent
08c5bdecae
commit
a0550cbc80
77
.buildkite/run-multi-node-test.sh
Executable file
77
.buildkite/run-multi-node-test.sh
Executable file
@ -0,0 +1,77 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -euox pipefail
|
||||
|
||||
if [[ $# -lt 3 ]]; then
|
||||
echo "Please provide the number of nodes and GPU per node."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
NUM_NODES=$1
|
||||
NUM_GPUS=$2
|
||||
DOCKER_IMAGE=$3
|
||||
|
||||
shift 3
|
||||
COMMANDS=("$@")
|
||||
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
|
||||
echo "The number of commands must be equal to the number of nodes."
|
||||
echo "Number of nodes: $NUM_NODES"
|
||||
echo "Number of commands: ${#COMMANDS[@]}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "List of commands"
|
||||
for command in "${COMMANDS[@]}"; do
|
||||
echo $command
|
||||
done
|
||||
|
||||
start_network() {
|
||||
docker network create --subnet=192.168.10.0/24 docker-net
|
||||
}
|
||||
|
||||
start_nodes() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
GPU_DEVICES='"device='
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
GPU_DEVICES+='"'
|
||||
# echo "Starting node$node with GPU devices: $GPU_DEVICES"
|
||||
docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null
|
||||
done
|
||||
}
|
||||
|
||||
run_nodes() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
GPU_DEVICES='"device='
|
||||
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
|
||||
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
|
||||
GPU_DEVICES+=$(($DEVICE_NUM))
|
||||
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
|
||||
GPU_DEVICES+=','
|
||||
fi
|
||||
done
|
||||
GPU_DEVICES+='"'
|
||||
echo "Running node$node with GPU devices: $GPU_DEVICES"
|
||||
if [ $node -lt $(($NUM_NODES - 1)) ]; then
|
||||
docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
|
||||
else
|
||||
docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
|
||||
fi
|
||||
done
|
||||
}
|
||||
cleanup() {
|
||||
for node in $(seq 0 $(($NUM_NODES-1))); do
|
||||
docker stop node$node
|
||||
done
|
||||
docker network rm docker-net
|
||||
}
|
||||
trap cleanup EXIT
|
||||
start_network
|
||||
start_nodes
|
||||
run_nodes
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user