Mirror of https://git.datalinker.icu/vllm-project/vllm.git — synced 2025-12-14 09:05:28 +08:00 (136 lines, 3.5 KiB, Bash)
#!/bin/bash
# End-to-end demo of LMCache disaggregated prefill on vLLM v1:
# launches a prefiller, a decoder, and a proxy, then runs a serving benchmark.

echo "Warning: LMCache disaggregated prefill support for vLLM v1 is experimental and subject to change."

# PIDs of the background servers (prefiller, decoder, proxy), appended as
# they are launched so cleanup can account for them.
PIDS=()

# Switch to the directory of the current script so the relative paths
# (disagg_vllm_launcher.sh, disagg_proxy_server.py, ../../../benchmarks/)
# resolve. Abort if the cd fails rather than running in the wrong directory.
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
check_hf_token() {
  # Validate that HF_TOKEN is set and looks like a Hugging Face token
  # (tokens issued by Hugging Face start with the "hf_" prefix).
  # Exits the whole script with status 1 on failure.
  # ${HF_TOKEN:-} keeps this safe even if the script later adopts `set -u`.
  if [ -z "${HF_TOKEN:-}" ]; then
    echo "HF_TOKEN is not set. Please set it to your Hugging Face token." >&2
    exit 1
  fi
  if [[ "$HF_TOKEN" != hf_* ]]; then
    echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token." >&2
    exit 1
  fi
  echo "HF_TOKEN is set and valid."
}
check_num_gpus() {
  # Require at least 2 GPUs — one for the prefiller instance, one for the
  # decoder instance. Exits with status 1 if nvidia-smi is unavailable or
  # reports fewer than 2 GPUs.
  local num_gpus
  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo "nvidia-smi not found. At least 2 NVIDIA GPUs are required to run disaggregated prefill." >&2
    exit 1
  fi
  # One CSV line per GPU; counting lines gives the GPU count.
  num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  if [ "$num_gpus" -lt 2 ]; then
    echo "You need at least 2 GPUs to run disaggregated prefill." >&2
    exit 1
  else
    echo "Found $num_gpus GPUs."
  fi
}
ensure_python_library_installed() {
  # Verify that the Python package named by $1 is importable with the
  # `python` on PATH; exit 1 with installation instructions otherwise.
  # nixl gets a dedicated message because it is not pip-installable from PyPI.
  local lib=$1
  echo "Checking if $lib is installed..."
  # Test the command directly instead of inspecting $? afterwards (SC2181).
  if ! python -c "import $lib" > /dev/null 2>&1; then
    if [ "$lib" == "nixl" ]; then
      echo "$lib is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation." >&2
    else
      echo "$lib is not installed. Please install it via pip install $lib." >&2
    fi
    exit 1
  else
    echo "$lib is installed."
  fi
}
cleanup() {
  # Tear down every process this script launched (prefiller, decoder, proxy
  # and their children) by signalling the script's entire process group.
  # Order matters: reset the traps first so the kill below cannot re-invoke
  # this handler, then kill, then reap.
  echo "Stopping everything…"
  trap - INT TERM          # prevent re-entrancy
  kill -- -$$              # negative PID == "this whole process-group"
  wait                     # reap children so we don't leave zombies
  exit 0
}
wait_for_server() {
  # Poll localhost:$1 once per second until its /v1/completions endpoint
  # accepts a connection, or until the timeout elapses.
  #
  # Arguments:
  #   $1 - port to probe
  #   $2 - (optional) timeout in seconds, default 1200
  # Returns:
  #   0 once the server responds, 1 on timeout.
  local port=$1
  local timeout_seconds=${2:-1200}
  # Declare and assign separately so a failing $(date) isn't masked (SC2155).
  local start_time now
  start_time=$(date +%s)

  echo "Waiting for server on port $port..."

  while true; do
    # curl exits 0 as soon as the TCP connection + HTTP exchange succeeds;
    # the response body itself is irrelevant here.
    if curl -s "localhost:${port}/v1/completions" > /dev/null; then
      return 0
    fi

    now=$(date +%s)
    if (( now - start_time >= timeout_seconds )); then
      echo "Timeout waiting for server" >&2
      return 1
    fi

    sleep 1
  done
}
main() {
  # Orchestrates the demo end to end: validate the environment, launch the
  # prefiller/decoder vLLM instances plus the proxy, wait for all three to
  # come up, run the serving benchmark, then tear everything down.
  check_hf_token
  check_num_gpus
  ensure_python_library_installed lmcache
  ensure_python_library_installed nixl
  ensure_python_library_installed pandas
  ensure_python_library_installed datasets
  ensure_python_library_installed vllm

  # Install handlers before spawning children so an interrupt at any point
  # tears down the whole process group via cleanup().
  trap cleanup INT
  trap cleanup USR1
  trap cleanup TERM

  echo "Launching prefiller, decoder and proxy..."
  echo "Please check prefiller.log, decoder.log and proxy.log for logs."

  # Process substitution tees each server's combined output into a log file
  # while the server runs in the background; $! captures its PID.
  bash disagg_vllm_launcher.sh prefiller \
    > >(tee prefiller.log) 2>&1 &
  prefiller_pid=$!
  PIDS+=($prefiller_pid)

  bash disagg_vllm_launcher.sh decoder \
    > >(tee decoder.log) 2>&1 &
  decoder_pid=$!
  PIDS+=($decoder_pid)

  # Proxy on 9000 fronts the prefiller (8100) and decoder (8200).
  # NOTE(review): ports are presumably kept in sync with
  # disagg_vllm_launcher.sh — confirm against that script.
  python3 disagg_proxy_server.py \
    --host localhost \
    --port 9000 \
    --prefiller-host localhost \
    --prefiller-port 8100 \
    --decoder-host localhost \
    --decoder-port 8200 \
    > >(tee proxy.log) 2>&1 &
  proxy_pid=$!
  PIDS+=($proxy_pid)

  # Block until all three endpoints respond before starting the benchmark.
  wait_for_server 8100
  wait_for_server 8200
  wait_for_server 9000

  echo "All servers are up. Starting benchmark..."

  # begin benchmark
  cd ../../../benchmarks/
  # Benchmark goes through the proxy (port 9000); seed varies per run.
  python benchmark_serving.py --port 9000 --seed $(date +%s) \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --dataset-name random --random-input-len 7500 --random-output-len 200 \
    --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log

  echo "Benchmarking done. Cleaning up..."

  cleanup
}
# Forward any command-line arguments to main (none are consumed today, but
# this is the conventional, future-proof entry-point form).
main "$@"