diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md
index 7732f50b1d229..ae5962fe92542 100644
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -39,6 +39,7 @@ You must set the following variables at the top of the script before execution.
 | `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
 | `INPUT_LEN` | **Required.** Request input length. | `4000` |
 | `OUTPUT_LEN` | **Required.** Request output length. | `16` |
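+| `MAX_MODEL_LEN` | **Required.** Maximum model length, passed to the server as `--max-model-len`. Must be at least `INPUT_LEN + OUTPUT_LEN`; otherwise the script exits with an error. | `4096` |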
 | `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
 | `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
 | `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
@@ -69,6 +70,7 @@ Here are a few examples of how to configure the script for different goals:
 ```bash
 INPUT_LEN=1800
 OUTPUT_LEN=20
+MAX_MODEL_LEN=2048
 MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
 ```
@@ -80,6 +82,7 @@ MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
 ```bash
 INPUT_LEN=1800
 OUTPUT_LEN=20
+MAX_MODEL_LEN=2048
 MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=500
 ```
@@ -91,6 +94,7 @@ MAX_LATENCY_ALLOWED_MS=500
 ```bash
 INPUT_LEN=1800
 OUTPUT_LEN=20
+MAX_MODEL_LEN=2048
 MIN_CACHE_HIT_PCT=60
 MAX_LATENCY_ALLOWED_MS=500
 ```
diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh
index eaa28ea5c92b9..8d3e1d4bee352 100644
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -4,13 +4,15 @@
 # See details in README (benchmarks/auto_tune/README.md).
 
 TAG=$(date +"%Y_%m_%d_%H_%M")
-BASE=""
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+BASE="$SCRIPT_DIR/../../.."
 MODEL="meta-llama/Llama-3.1-8B-Instruct"
 SYSTEM="TPU"
 TP=1
 DOWNLOAD_DIR=""
 INPUT_LEN=4000
 OUTPUT_LEN=16
+MAX_MODEL_LEN=4096
 MIN_CACHE_HIT_PCT=0
 MAX_LATENCY_ALLOWED_MS=100000000000
 NUM_SEQS_LIST="128 256"
@@ -36,6 +38,13 @@ current_hash=$(git rev-parse HEAD)
 echo "hash:$current_hash" >> "$RESULT"
 echo "current_hash: $current_hash"
 
+TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
+RED='\033[0;31m'
+if (( TOTAL_LEN > MAX_MODEL_LEN )); then
+    echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
+    exit 1
+fi
+
 best_throughput=0
 best_max_num_seqs=0
 best_num_batched_tokens=0
@@ -60,7 +69,7 @@ start_server() {
         --enable-prefix-caching \
         --load-format dummy \
         --download-dir "$DOWNLOAD_DIR" \
-        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
+        --max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
 
     # wait for 10 minutes...
     server_started=0
@@ -245,4 +254,3 @@ done
 echo "finish permutations"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
-