#!/bin/bash

# This script aims to tune the best server parameter combination to maximize throughput for a given requirement.
# See details in the README (benchmarks/auto_tune/README.md).

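# Tunables below that use ${VAR:-default} expansion can be overridden from the calling
# environment without editing this file. Illustrative invocation (script path and
# values are examples only):
#   MODEL="meta-llama/Llama-3.1-70B-Instruct" TP=4 MAX_LATENCY_ALLOWED_MS=500 \
#     bash benchmarks/auto_tune/auto_tune.sh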
TAG=$(date +"%Y_%m_%d_%H_%M")
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
BASE=${BASE:-"$SCRIPT_DIR/../../.."}
MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
SYSTEM=${SYSTEM:-"TPU"}
TP=${TP:-1}
DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
INPUT_LEN=${INPUT_LEN:-4000}
OUTPUT_LEN=${OUTPUT_LEN:-16}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}

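# All artifacts from this run (server logs, benchmark logs, the result summary and
# profiler traces) are written under a timestamped folder: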
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"

echo "====================== AUTO TUNE PARAMETERS ===================="
echo "SCRIPT_DIR=$SCRIPT_DIR"
echo "BASE=$BASE"
echo "MODEL=$MODEL"
echo "SYSTEM=$SYSTEM"
echo "TP=$TP"
echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
echo "INPUT_LEN=$INPUT_LEN"
echo "OUTPUT_LEN=$OUTPUT_LEN"
echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNE PARAMETERS ===================="

rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH

cd "$BASE/vllm"

pip install -q datasets

current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

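# Sanity check: the requested input + output tokens must fit within the model context window.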
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
RED='\033[0;31m'
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
    echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
    exit 1
fi

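# Best configuration found so far; updated by run_benchmark() as the sweep progresses.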
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
best_request_rate=0

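# start_server <gpu_memory_utilization> <max_num_seqs> <max_num_batched_tokens> <vllm_log> <profile_dir>
# Launches `vllm serve` in the background with dummy weights (--load-format dummy) and the
# given knobs, logging to <vllm_log>. If <profile_dir> is non-empty, the torch profiler
# output directory is set so the final profiling run can capture a trace.
# Polls /health for up to 10 minutes; returns 0 once the server is up, 1 otherwise.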
start_server() {
    local gpu_memory_utilization=$1
    local max_num_seqs=$2
    local max_num_batched_tokens=$3
    local vllm_log=$4
    local profile_dir=$5

    pkill -if "vllm serve" || true

    # Define the common arguments as a bash array.
    # Each argument and its value are separate elements.
    local common_args_array=(
        "$MODEL"
        "--disable-log-requests"
        "--port" "8004"
        "--gpu-memory-utilization" "$gpu_memory_utilization"
        "--max-num-seqs" "$max_num_seqs"
        "--max-num-batched-tokens" "$max_num_batched_tokens"
        "--tensor-parallel-size" "$TP"
        "--enable-prefix-caching"
        "--load-format" "dummy"
        "--download-dir" "$DOWNLOAD_DIR"
        "--max-model-len" "$MAX_MODEL_LEN"
    )

    # Use the array expansion "${common_args_array[@]}".
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    else
        # Start server without profiling
        VLLM_SERVER_DEV_MODE=1 \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    fi
    local server_pid=$!

    # Wait up to 10 minutes for the server to become healthy.
    server_started=0
    for i in {1..60}; do
        # Stop polling if the server process has died; we can always signal our own
        # child process, so kill -0 reliably reports whether it is still alive.
        kill -0 $server_pid 2> /dev/null || break

        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
        if [[ "$STATUS_CODE" -eq 200 ]]; then
            server_started=1
            break
        else
            sleep 10
        fi
    done

    if (( ! server_started )); then
        echo "server did not start within 10 minutes or crashed. Please check the server log at $vllm_log."
        return 1
    else
        return 0
    fi
}

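# run_benchmark <max_num_seqs> <max_num_batched_tokens> <gpu_memory_utilization>
# Starts a fresh server with the given limits, measures throughput with an unthrottled
# (request-rate inf) run, and, if the P99 end-to-end latency misses MAX_LATENCY_ALLOWED_MS,
# retries at decreasing integer request rates until the target is met or the rate reaches 0.
# Appends the outcome to $RESULT and updates the global best_* variables when a new best
# throughput is found.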
run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    local gpu_memory_utilization=$3
    echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
    pkill -if "vllm serve" || true

    echo "starting server..."
    # Call start_server without a profile_dir to avoid profiling overhead
    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
    result=$?
    if [[ "$result" -eq 1 ]]; then
        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    else
        echo "server started."
    fi
    echo

echo "run benchmark test..."
|
|
meet_latency_requirement=0
|
|
# get a basic qps by using request-rate inf
|
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
|
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
|
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
|
# --profile flag is removed from this call
|
|
vllm bench serve \
|
|
--backend vllm \
|
|
--model $MODEL \
|
|
--dataset-name random \
|
|
--random-input-len $adjusted_input_len \
|
|
--random-output-len $OUTPUT_LEN \
|
|
--ignore-eos \
|
|
--disable-tqdm \
|
|
--request-rate inf \
|
|
--percentile-metrics ttft,tpot,itl,e2el \
|
|
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
|
--num-prompts 1000 \
|
|
--random-prefix-len $prefix_len \
|
|
--port 8004 &> "$bm_log"
|
|
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
|
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
|
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
|
|
|
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
|
meet_latency_requirement=1
|
|
request_rate=inf
|
|
fi
|
|
|
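    # If the unthrottled run misses the latency target, search downward for the
    # highest integer request rate that still satisfies MAX_LATENCY_ALLOWED_MS.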
    if (( ! meet_latency_requirement )); then
        # Start from request_rate = int(throughput) + 1 and decrease by 1 each iteration.
        request_rate=$((${throughput%.*} + 1))
        while ((request_rate > 0)); do
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
            vllm bench serve \
                --backend vllm \
                --model $MODEL \
                --dataset-name random \
                --random-input-len $adjusted_input_len \
                --random-output-len $OUTPUT_LEN \
                --ignore-eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
                --random-prefix-len $prefix_len \
                --port 8004 &> "$bm_log"
            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
                meet_latency_requirement=1
                break
            fi
            request_rate=$((request_rate-1))
        done
    fi
    # write the results and update the best result.
    if ((meet_latency_requirement)); then
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
            best_throughput=$throughput
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
            best_request_rate=$request_rate
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
    fi

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

    pkill -if "vllm serve" || true
    sleep 10
    echo "===================="
    return 0
}

read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"

# First find the largest gpu-memory-utilization that can serve the model without an HBM OOM,
# probing with the last (typically largest) entries of NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
    # Pass empty string for profile_dir argument
    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
    result=$?
    if [[ "$result" -eq 0 ]]; then
        find_gpu_memory_utilization=1
        break
    else
        gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
    fi
done

if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
|
|
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
|
|
else
|
|
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
|
|
exit 1
|
|
fi
|
|
|
|
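# Sweep every (max_num_seqs, max_num_batched_tokens) combination with the
# gpu_memory_utilization found above; each run appends its outcome to $RESULT.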
for num_seqs in "${num_seqs_list[@]}"; do
    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
    done
done
echo "finished all permutations"

# =================================================================================
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
# =================================================================================
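# Re-run the best configuration once with profiling enabled: start_server receives
# $PROFILE_PATH as its profile_dir and the benchmark is invoked with --profile so a
# profiler trace is captured alongside the logs.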
if (( $(echo "$best_throughput > 0" | bc -l) )); then
    echo
    echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
    echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
    echo

    vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
    bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"

    # Start server with the best params and profiling ENABLED
    echo "Starting server for profiling..."
    start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"

    # Run benchmark with the best params and the --profile flag
    echo "Running benchmark with profiling..."
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
    adjusted_input_len=$(( INPUT_LEN - prefix_len ))
    vllm bench serve \
        --backend vllm \
        --model $MODEL \
        --dataset-name random \
        --random-input-len $adjusted_input_len \
        --random-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
        --request-rate $best_request_rate \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 100 \
        --random-prefix-len $prefix_len \
        --port 8004 \
        --profile &> "$bm_log"
else
    echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if "vllm serve" || true
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"