[Bug] Update auto_tune.sh to separate benchmarking and profiling. (#21629)

Signed-off-by: Eric Hanley <ericehanley@google.com>
2026-05-25 12:04:27 +08:00 · 2025-08-04 10:12:06 -05:00 · 2025-08-04 10:12:06 -05:00 · 309c1bb822
commit 309c1bb822
parent 9af654cc38
1 changed files with 80 additions and 43 deletions
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -49,6 +49,7 @@ best_throughput=0
 best_max_num_seqs=0
 best_num_batched_tokens=0
 best_goodput=0
 best_request_rate=0
 start_server() {
    local gpu_memory_utilization=$1
@ -57,18 +58,35 @@ start_server() {
    local vllm_log=$4
    local profile_dir=$5
-    pkill -f vllm
+    pkill -if vllm
-    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
+    # Define the common arguments as a bash array.
-        --port 8004 \
+    # Each argument and its value are separate elements.
-        --gpu-memory-utilization $gpu_memory_utilization \
+    local common_args_array=(
-        --max-num-seqs $max_num_seqs \
+        "$MODEL"
-        --max-num-batched-tokens $max_num_batched_tokens \
+        "--disable-log-requests"
-        --tensor-parallel-size $TP \
+        "--port" "8004"
-        --enable-prefix-caching \
+        "--gpu-memory-utilization" "$gpu_memory_utilization"
-        --load-format dummy \
+        "--max-num-seqs" "$max_num_seqs"
-        --download-dir "$DOWNLOAD_DIR" \
+        "--max-num-batched-tokens" "$max_num_batched_tokens"
-        --max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
+        "--tensor-parallel-size" "$TP"
        "--enable-prefix-caching"
        "--load-format" "dummy"
        "--download-dir" "$DOWNLOAD_DIR"
        "--max-model-len" "$MAX_MODEL_LEN"
    )
    # Use the array expansion "${common_args_array[@]}"
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    else
        # Start server without profiling
        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    fi
    # wait for 10 minutes...
    server_started=0
@ -82,6 +100,7 @@ start_server() {
            sleep 10
        fi
    done
    if (( ! server_started )); then
        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
        return 1
@ -90,37 +109,20 @@ start_server() {
    fi
 }
 #######################################
 # Replace the contents of $PROFILE_PATH with one profile taken from
 # profile_dir. Candidates are the depth-1 entries of profile_dir in sorted
 # order; profile_index selects one of them.
 # Globals:   SYSTEM (read; "TPU" or "GPU"), PROFILE_PATH (read)
 # Arguments: $1 - directory whose direct children are the profile entries
 #            $2 - zero-based index into the sorted list of those entries
 # Returns:   0 on success; 1 if PROFILE_PATH is empty, the index is out of
 #            range, SYSTEM is unsupported, or nothing matches. In all of
 #            those cases $PROFILE_PATH is left untouched. Otherwise the
 #            exit status of cp.
 #######################################
 update_best_profile() {
    local profile_dir=$1
    local profile_index=$2
    local -a sorted_paths=()
    local -a selected_profile_files=()
    local selected_entry

    # Guard the destructive rm below: an empty PROFILE_PATH would turn the
    # glob into "/*".
    if [[ -z "${PROFILE_PATH:-}" ]]; then
        echo "update_best_profile: PROFILE_PATH is not set" >&2
        return 1
    fi

    # Read one path per line so entries containing spaces stay intact.
    mapfile -t sorted_paths < <(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort)

    if [[ ! "$profile_index" =~ ^[0-9]+$ ]] || (( profile_index >= ${#sorted_paths[@]} )); then
        echo "update_best_profile: no profile at index $profile_index in $profile_dir" >&2
        return 1
    fi
    selected_entry=${sorted_paths[profile_index]}

    case "$SYSTEM" in
        TPU)
            # The selected entry is treated as a directory; every
            # *.xplane.pb file inside it is copied.
            selected_profile_files=("$selected_entry"/*.xplane.pb)
            ;;
        GPU)
            # The selected entry itself is copied.
            selected_profile_files=("$selected_entry")
            ;;
        *)
            echo "update_best_profile: unsupported SYSTEM '$SYSTEM'" >&2
            return 1
            ;;
    esac

    # An unmatched glob stays literal (or expands to nothing under nullglob),
    # so confirm there is something to copy before deleting the old profile.
    if [[ ! -e "${selected_profile_files[0]:-}" ]]; then
        echo "update_best_profile: no profile files found under $selected_entry" >&2
        return 1
    fi

    rm -f -- "$PROFILE_PATH"/*
    cp -- "${selected_profile_files[@]}" "$PROFILE_PATH"
 }
 run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    local gpu_memory_utilization=$3
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
-    mkdir -p $profile_dir
+    pkill -if vllm
    pkill -f vllm
    local profile_index=0
    echo "starting server..."
-    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
+    # Call start_server without a profile_dir to avoid profiling overhead
    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
    result=$?
    if [[ "$result" -eq 1 ]]; then
        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@ -134,7 +136,8 @@ run_benchmark() {
    # get a basic qps by using request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
-adjusted_input_len=$(( INPUT_LEN - prefix_len ))
+    adjusted_input_len=$(( INPUT_LEN - prefix_len ))
    # --profile flag is removed from this call
    vllm bench serve \
        --backend vllm \
        --model $MODEL  \
@ -148,8 +151,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 1000 \
        --random-prefix-len $prefix_len \
-        --port 8004 \
+        --port 8004 &> "$bm_log"
        --profile &> "$bm_log"
    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@ -163,7 +165,6 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
    # start from request-rate as int(throughput) + 1
        request_rate=$((${throughput%.*} + 1))
        while ((request_rate > 0)); do
            profile_index=$((profile_index+1))
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
@ -201,12 +202,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
-            if [[ "$SYSTEM" == "TPU" ]]; then
+            best_request_rate=$request_rate
                update_best_profile "$profile_dir/plugins/profile" $profile_index
            fi
            if [[ "$SYSTEM" == "GPU" ]]; then
                update_best_profile "$profile_dir" $profile_index
            fi
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@ -215,7 +211,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
-    pkill vllm
+    pkill -if vllm
    sleep 10
    printf '=%.0s' $(seq 1 20)
    return 0
@ -228,7 +224,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
 gpu_memory_utilization=0.98
 find_gpu_memory_utilization=0
 while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
-    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
+    # Pass empty string for profile_dir argument
    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
    result=$?
    if [[ "$result" -eq 0 ]]; then
        find_gpu_memory_utilization=1
@ -251,5 +248,45 @@ for num_seqs in "${num_seqs_list[@]}"; do
    done
 done
 echo "finish permutations"
 # =================================================================================
 # FINAL PROFILING RUN FOR THE BEST CONFIGURATION
 # =================================================================================
 # Runs only when some configuration produced a positive best_throughput.
 # The server is restarted with the best max_num_seqs / max_num_batched_tokens
 # and with $PROFILE_PATH passed as the profile directory, then a single
 # benchmark is issued with --profile at the best request rate.
 # Reads:  best_throughput, best_max_num_seqs, best_num_batched_tokens,
 #         best_request_rate, gpu_memory_utilization, LOG_FOLDER, PROFILE_PATH,
 #         MODEL, INPUT_LEN, OUTPUT_LEN, MIN_CACHE_HIT_PCT, MAX_LATENCY_ALLOWED_MS
 # Writes: vllm_log, bm_log, prefix_len, adjusted_input_len
 if (( $(echo "$best_throughput > 0" | bc -l) )); then
    echo
    echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
    echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
    echo
    vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
    bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"
    # Start server with the best params and profiling ENABLED
    echo "Starting server for profiling..."
    # start_server returns non-zero when the server does not come up; in that
    # case there is nothing to benchmark, so skip the profiling request.
    if start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"; then
        # Run benchmark with the best params and the --profile flag
        echo "Running benchmark with profiling..."
        prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
        adjusted_input_len=$(( INPUT_LEN - prefix_len ))
        vllm bench serve \
            --backend vllm \
            --model "$MODEL" \
            --dataset-name random \
            --random-input-len "$adjusted_input_len" \
            --random-output-len "$OUTPUT_LEN" \
            --ignore-eos \
            --disable-tqdm \
            --request-rate "$best_request_rate" \
            --percentile-metrics ttft,tpot,itl,e2el \
            --goodput "e2el:$MAX_LATENCY_ALLOWED_MS" \
            --num-prompts 100 \
            --random-prefix-len "$prefix_len" \
            --port 8004 \
            --profile &> "$bm_log"
    else
        echo "server failed to start for the profiling run. Skipping profiling. Please check server log at $vllm_log"
    fi
 else
    echo "No configuration met the latency requirements. Skipping final profiling run."
 fi
 pkill -if vllm
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"