mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-25 12:04:27 +08:00
[Bug] Update auto_tune.sh to separate benchmarking and profiling. (#21629)
Signed-off-by: Eric Hanley <ericehanley@google.com>
This commit is contained in:
parent
9af654cc38
commit
309c1bb822
@ -49,6 +49,7 @@ best_throughput=0
|
|||||||
best_max_num_seqs=0
|
best_max_num_seqs=0
|
||||||
best_num_batched_tokens=0
|
best_num_batched_tokens=0
|
||||||
best_goodput=0
|
best_goodput=0
|
||||||
|
best_request_rate=0
|
||||||
|
|
||||||
start_server() {
|
start_server() {
|
||||||
local gpu_memory_utilization=$1
|
local gpu_memory_utilization=$1
|
||||||
@ -57,18 +58,35 @@ start_server() {
|
|||||||
local vllm_log=$4
|
local vllm_log=$4
|
||||||
local profile_dir=$5
|
local profile_dir=$5
|
||||||
|
|
||||||
pkill -f vllm
|
pkill -if vllm
|
||||||
|
|
||||||
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
|
# Define the common arguments as a bash array.
|
||||||
--port 8004 \
|
# Each argument and its value are separate elements.
|
||||||
--gpu-memory-utilization $gpu_memory_utilization \
|
local common_args_array=(
|
||||||
--max-num-seqs $max_num_seqs \
|
"$MODEL"
|
||||||
--max-num-batched-tokens $max_num_batched_tokens \
|
"--disable-log-requests"
|
||||||
--tensor-parallel-size $TP \
|
"--port" "8004"
|
||||||
--enable-prefix-caching \
|
"--gpu-memory-utilization" "$gpu_memory_utilization"
|
||||||
--load-format dummy \
|
"--max-num-seqs" "$max_num_seqs"
|
||||||
--download-dir "$DOWNLOAD_DIR" \
|
"--max-num-batched-tokens" "$max_num_batched_tokens"
|
||||||
--max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
|
"--tensor-parallel-size" "$TP"
|
||||||
|
"--enable-prefix-caching"
|
||||||
|
"--load-format" "dummy"
|
||||||
|
"--download-dir" "$DOWNLOAD_DIR"
|
||||||
|
"--max-model-len" "$MAX_MODEL_LEN"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Use the array expansion "${common_args_array[@]}"
|
||||||
|
# This correctly passes each element as a separate argument.
|
||||||
|
if [[ -n "$profile_dir" ]]; then
|
||||||
|
# Start server with profiling enabled
|
||||||
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
|
||||||
|
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||||
|
else
|
||||||
|
# Start server without profiling
|
||||||
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
|
||||||
|
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||||
|
fi
|
||||||
|
|
||||||
# wait for 10 minutes...
|
# wait for 10 minutes...
|
||||||
server_started=0
|
server_started=0
|
||||||
@ -82,6 +100,7 @@ start_server() {
|
|||||||
sleep 10
|
sleep 10
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
if (( ! server_started )); then
|
if (( ! server_started )); then
|
||||||
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
|
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
|
||||||
return 1
|
return 1
|
||||||
@ -90,37 +109,20 @@ start_server() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
update_best_profile() {
|
|
||||||
local profile_dir=$1
|
|
||||||
local profile_index=$2
|
|
||||||
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
|
|
||||||
selected_profile_file=
|
|
||||||
if [[ "$SYSTEM" == "TPU" ]]; then
|
|
||||||
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
|
|
||||||
fi
|
|
||||||
if [[ "$SYSTEM" == "GPU" ]]; then
|
|
||||||
selected_profile_file="${sorted_paths[$profile_index]}"
|
|
||||||
fi
|
|
||||||
rm -f $PROFILE_PATH/*
|
|
||||||
cp $selected_profile_file $PROFILE_PATH
|
|
||||||
}
|
|
||||||
|
|
||||||
run_benchmark() {
|
run_benchmark() {
|
||||||
local max_num_seqs=$1
|
local max_num_seqs=$1
|
||||||
local max_num_batched_tokens=$2
|
local max_num_batched_tokens=$2
|
||||||
local gpu_memory_utilization=$3
|
local gpu_memory_utilization=$3
|
||||||
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||||
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
||||||
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
|
|
||||||
echo "vllm_log: $vllm_log"
|
echo "vllm_log: $vllm_log"
|
||||||
echo
|
echo
|
||||||
rm -f $vllm_log
|
rm -f $vllm_log
|
||||||
mkdir -p $profile_dir
|
pkill -if vllm
|
||||||
pkill -f vllm
|
|
||||||
local profile_index=0
|
|
||||||
|
|
||||||
echo "starting server..."
|
echo "starting server..."
|
||||||
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
|
# Call start_server without a profile_dir to avoid profiling overhead
|
||||||
|
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
|
||||||
result=$?
|
result=$?
|
||||||
if [[ "$result" -eq 1 ]]; then
|
if [[ "$result" -eq 1 ]]; then
|
||||||
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||||
@ -134,7 +136,8 @@ run_benchmark() {
|
|||||||
# get a basic qps by using request-rate inf
|
# get a basic qps by using request-rate inf
|
||||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
||||||
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
||||||
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
||||||
|
# --profile flag is removed from this call
|
||||||
vllm bench serve \
|
vllm bench serve \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--model $MODEL \
|
--model $MODEL \
|
||||||
@ -148,8 +151,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
|||||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||||
--num-prompts 1000 \
|
--num-prompts 1000 \
|
||||||
--random-prefix-len $prefix_len \
|
--random-prefix-len $prefix_len \
|
||||||
--port 8004 \
|
--port 8004 &> "$bm_log"
|
||||||
--profile &> "$bm_log"
|
|
||||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||||
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||||
@ -163,7 +165,6 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
|||||||
# start from request-rate as int(throughput) + 1
|
# start from request-rate as int(throughput) + 1
|
||||||
request_rate=$((${throughput%.*} + 1))
|
request_rate=$((${throughput%.*} + 1))
|
||||||
while ((request_rate > 0)); do
|
while ((request_rate > 0)); do
|
||||||
profile_index=$((profile_index+1))
|
|
||||||
# clear prefix cache
|
# clear prefix cache
|
||||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||||
sleep 5
|
sleep 5
|
||||||
@ -201,12 +202,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
|||||||
best_max_num_seqs=$max_num_seqs
|
best_max_num_seqs=$max_num_seqs
|
||||||
best_num_batched_tokens=$max_num_batched_tokens
|
best_num_batched_tokens=$max_num_batched_tokens
|
||||||
best_goodput=$goodput
|
best_goodput=$goodput
|
||||||
if [[ "$SYSTEM" == "TPU" ]]; then
|
best_request_rate=$request_rate
|
||||||
update_best_profile "$profile_dir/plugins/profile" $profile_index
|
|
||||||
fi
|
|
||||||
if [[ "$SYSTEM" == "GPU" ]]; then
|
|
||||||
update_best_profile "$profile_dir" $profile_index
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
|
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
|
||||||
@ -215,7 +211,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
|||||||
|
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||||
|
|
||||||
pkill vllm
|
pkill -if vllm
|
||||||
sleep 10
|
sleep 10
|
||||||
printf '=%.0s' $(seq 1 20)
|
printf '=%.0s' $(seq 1 20)
|
||||||
return 0
|
return 0
|
||||||
@ -228,7 +224,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
|
|||||||
gpu_memory_utilization=0.98
|
gpu_memory_utilization=0.98
|
||||||
find_gpu_memory_utilization=0
|
find_gpu_memory_utilization=0
|
||||||
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
|
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
|
||||||
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
|
# Pass empty string for profile_dir argument
|
||||||
|
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
|
||||||
result=$?
|
result=$?
|
||||||
if [[ "$result" -eq 0 ]]; then
|
if [[ "$result" -eq 0 ]]; then
|
||||||
find_gpu_memory_utilization=1
|
find_gpu_memory_utilization=1
|
||||||
@ -251,5 +248,45 @@ for num_seqs in "${num_seqs_list[@]}"; do
|
|||||||
done
|
done
|
||||||
done
|
done
|
||||||
echo "finish permutations"
|
echo "finish permutations"
|
||||||
|
|
||||||
|
# =================================================================================
|
||||||
|
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
|
||||||
|
# =================================================================================
|
||||||
|
if (( $(echo "$best_throughput > 0" | bc -l) )); then
|
||||||
|
echo
|
||||||
|
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
|
||||||
|
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
|
||||||
|
echo
|
||||||
|
|
||||||
|
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
|
||||||
|
bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"
|
||||||
|
|
||||||
|
# Start server with the best params and profiling ENABLED
|
||||||
|
echo "Starting server for profiling..."
|
||||||
|
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
|
||||||
|
|
||||||
|
# Run benchmark with the best params and the --profile flag
|
||||||
|
echo "Running benchmark with profiling..."
|
||||||
|
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
|
||||||
|
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
|
||||||
|
vllm bench serve \
|
||||||
|
--backend vllm \
|
||||||
|
--model $MODEL \
|
||||||
|
--dataset-name random \
|
||||||
|
--random-input-len $adjusted_input_len \
|
||||||
|
--random-output-len $OUTPUT_LEN \
|
||||||
|
--ignore-eos \
|
||||||
|
--disable-tqdm \
|
||||||
|
--request-rate $best_request_rate \
|
||||||
|
--percentile-metrics ttft,tpot,itl,e2el \
|
||||||
|
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||||
|
--num-prompts 100 \
|
||||||
|
--random-prefix-len $prefix_len \
|
||||||
|
--port 8004 \
|
||||||
|
--profile &> "$bm_log"
|
||||||
|
else
|
||||||
|
echo "No configuration met the latency requirements. Skipping final profiling run."
|
||||||
|
fi
|
||||||
|
pkill -if vllm
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user