From c01d589813f40c9ea25db3cdaa2c6c2144ab4e53 Mon Sep 17 00:00:00 2001
From: Kevin Musgrave <kevin.musgrave@gmail.com>
Date: Mon, 15 Dec 2025 17:00:29 -0500
Subject: [PATCH] [Benchmarks] `auto_tune.sh`: Use hostname variable for server
 requests (#30529)

Signed-off-by: Kevin Musgrave <kevin.musgrave@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 benchmarks/auto_tune/auto_tune.sh | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index 25baa9cbda39c..a245e2022e605 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -18,6 +18,11 @@ MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
 MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
 NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
 NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
+HOSTNAME=$(hostname)
+if [[ -z "$HOSTNAME" ]]; then
+    echo "Error: Failed to determine hostname." >&2
+    exit 1
+fi
 
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
@@ -82,6 +87,7 @@ start_server() {
         "$MODEL"
         "--disable-log-requests"
         "--port" "8004"
+        "--host" "$HOSTNAME"
         "--gpu-memory-utilization" "$gpu_memory_utilization"
         "--max-num-seqs" "$max_num_seqs"
         "--max-num-batched-tokens" "$max_num_batched_tokens"
@@ -113,7 +119,7 @@ start_server() {
         # since that we should always have permission to send signal to the server process.
         kill -0 $server_pid 2> /dev/null || break
 
-        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
+        RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
         STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
         if [[ "$STATUS_CODE" -eq 200 ]]; then
             server_started=1
@@ -173,6 +179,7 @@ run_benchmark() {
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
         --num-prompts 1000 \
         --random-prefix-len $prefix_len \
+        --host "$HOSTNAME" \
         --port 8004 &> "$bm_log"
     throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
     e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -188,7 +195,7 @@ run_benchmark() {
         request_rate=$((${throughput%.*} + 1))
         while ((request_rate > 0)); do
             # clear prefix cache
-            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
+            curl -X POST http://${HOSTNAME}:8004/reset_prefix_cache
             sleep 5
             bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
             vllm bench serve \
@@ -204,6 +211,7 @@ run_benchmark() {
                 --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                 --num-prompts 100 \
                 --random-prefix-len $prefix_len \
+                --host "$HOSTNAME" \
                 --port 8004 &> "$bm_log"
             throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
             e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
@@ -304,6 +312,7 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
         --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
         --num-prompts 100 \
         --random-prefix-len $prefix_len \
+        --host "$HOSTNAME" \
         --port 8004 \
         --profile &> "$bm_log"
 else