#!/bin/bash
#
# Run a multi-node test on a single host.
#
# One Docker container is started per "node". Every container joins the
# user-defined network `docker-net` (192.168.10.0/24) with a static IP of
# 192.168.10.(10 + node index) and is given its own contiguous slice of the
# host's GPUs: node i gets devices [i * NUM_GPUS, (i + 1) * NUM_GPUS).
#
# Usage:
#   run-multi-node-test.sh NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND...
#
#   NUM_NODES     number of containers ("nodes") to start (positive integer)
#   NUM_GPUS      number of GPUs assigned to each node (positive integer)
#   DOCKER_IMAGE  image every node is started from
#   COMMAND...    exactly NUM_NODES commands; the i-th runs in node i via
#                 `/bin/bash -c`
#
# The commands of all nodes but the last are launched detached. The last
# node's command runs in the foreground, so its exit status becomes the exit
# status of this script.
#
# Provenance: added as .buildkite/run-multi-node-test.sh in commit
# a0550cbc80f504aa2da80b573c22204f686a0389, "Add support for multi-node on CI
# (#5955)", by Kevin H. Luu.

set -euox pipefail

if [[ $# -lt 3 ]]; then
  echo "Please provide the number of nodes, the number of GPUs per node, and the Docker image." >&2
  echo "Usage: $0 NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND..." >&2
  exit 1
fi

NUM_NODES=$1
NUM_GPUS=$2
DOCKER_IMAGE=$3

# Both counts are used in shell arithmetic below. Reject anything that is not
# a plain positive decimal so that stray text is not evaluated as an
# expression and a leading zero is not interpreted as octal.
for count in "$NUM_NODES" "$NUM_GPUS"; do
  if ! [[ "$count" =~ ^[1-9][0-9]*$ ]]; then
    echo "The number of nodes and GPUs per node must be positive integers." >&2
    echo "Number of nodes: $NUM_NODES" >&2
    echo "GPUs per node: $NUM_GPUS" >&2
    exit 1
  fi
done

shift 3
COMMANDS=("$@")
if (( ${#COMMANDS[@]} != NUM_NODES )); then
  echo "The number of commands must be equal to the number of nodes." >&2
  echo "Number of nodes: $NUM_NODES" >&2
  echo "Number of commands: ${#COMMANDS[@]}" >&2
  exit 1
fi

echo "List of commands"
for command in "${COMMANDS[@]}"; do
  # printf with a quoted argument logs the command verbatim; an unquoted echo
  # would word-split it and expand any glob characters it contains.
  printf '%s\n' "$command"
done

#######################################
# Print the `--gpus` value for one node.
# Globals:   NUM_GPUS (read)
# Arguments: $1 - zero-based node index
# Outputs:   "device=A,B,..." on stdout, INCLUDING the literal double quotes.
#            Docker parses the --gpus value as CSV, so a comma-separated
#            device list has to be wrapped in quotes to stay a single field.
#######################################
gpu_devices_for_node() {
  local node=$1
  local first_device=$(( node * NUM_GPUS ))
  local devices=""
  local offset
  for (( offset = 0; offset < NUM_GPUS; offset++ )); do
    # ${devices:+,} yields a comma only once the list is non-empty.
    devices+="${devices:+,}$(( first_device + offset ))"
  done
  printf '"device=%s"' "$devices"
}

#######################################
# Create the network all node containers attach to.
#######################################
start_network() {
  docker network create --subnet=192.168.10.0/24 docker-net
}

#######################################
# Start one detached, idle container per node.
# Each container only runs `tail -f /dev/null` so that it stays alive until
# the real command is injected with `docker exec` (see run_nodes). `--rm`
# makes Docker delete the container as soon as it is stopped.
# Globals: NUM_NODES, NUM_GPUS, DOCKER_IMAGE (read)
#######################################
start_nodes() {
  local node gpu_devices
  for (( node = 0; node < NUM_NODES; node++ )); do
    gpu_devices=$(gpu_devices_for_node "$node")
    docker run -d --gpus "$gpu_devices" --name "node$node" \
      --network docker-net --ip "192.168.10.$(( 10 + node ))" \
      --rm "$DOCKER_IMAGE" tail -f /dev/null
  done
}

#######################################
# Run the i-th command inside node i.
# All nodes except the last are exec'd detached (-d); the last one runs in
# the foreground so the script blocks on it and inherits its exit status.
# Globals: NUM_NODES, NUM_GPUS, COMMANDS (read)
# Returns: exit status of the last node's command.
#######################################
run_nodes() {
  local node gpu_devices
  local last_node=$(( NUM_NODES - 1 ))
  for (( node = 0; node < NUM_NODES; node++ )); do
    gpu_devices=$(gpu_devices_for_node "$node")
    echo "Running node$node with GPU devices: $gpu_devices"
    if (( node < last_node )); then
      docker exec -d "node$node" /bin/bash -c "${COMMANDS[node]}"
    else
      docker exec "node$node" /bin/bash -c "${COMMANDS[node]}"
    fi
  done
}

#######################################
# Stop every node container and remove the network.
# Runs as the EXIT trap, where `set -e` is still in effect: every step is
# therefore best-effort (`|| true`). Otherwise a single failing `docker stop`
# (e.g. for a container that was never started because start_nodes aborted)
# would end the handler early and leave `docker-net` behind, which makes the
# next run fail in start_network.
# Globals: NUM_NODES (read)
#######################################
cleanup() {
  local node
  for (( node = 0; node < NUM_NODES; node++ )); do
    docker stop "node$node" || true
  done
  docker network rm docker-net || true
}

trap cleanup EXIT
start_network
start_nodes
run_nodes