mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-31 13:47:08 +08:00
[Misc][Tools] make max-model-len a parameter in auto_tune script (#21321)
Signed-off-by: Chengji Yao <chengjiyao@google.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
parent
fde60ee775
commit
947edd099e
@ -39,6 +39,7 @@ You must set the following variables at the top of the script before execution.
|
|||||||
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
||||||
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
||||||
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
|
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
|
||||||
|
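| `MAX_MODEL_LEN` | **Required.** Max model length, passed to the server as `--max-model-len`. Must be at least `INPUT_LEN + OUTPUT_LEN`; otherwise the script prints an error and exits. | `4096` |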
|
||||||
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
|
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
|
||||||
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
|
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
|
||||||
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
|
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
|
||||||
@ -69,6 +70,7 @@ Here are a few examples of how to configure the script for different goals:
|
|||||||
```bash
|
```bash
|
||||||
INPUT_LEN=1800
|
INPUT_LEN=1800
|
||||||
OUTPUT_LEN=20
|
OUTPUT_LEN=20
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
MIN_CACHE_HIT_PCT=0
|
MIN_CACHE_HIT_PCT=0
|
||||||
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
|
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
|
||||||
```
|
```
|
||||||
@ -80,6 +82,7 @@ MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
|
|||||||
```bash
|
```bash
|
||||||
INPUT_LEN=1800
|
INPUT_LEN=1800
|
||||||
OUTPUT_LEN=20
|
OUTPUT_LEN=20
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
MIN_CACHE_HIT_PCT=0
|
MIN_CACHE_HIT_PCT=0
|
||||||
MAX_LATENCY_ALLOWED_MS=500
|
MAX_LATENCY_ALLOWED_MS=500
|
||||||
```
|
```
|
||||||
@ -91,6 +94,7 @@ MAX_LATENCY_ALLOWED_MS=500
|
|||||||
```bash
|
```bash
|
||||||
INPUT_LEN=1800
|
INPUT_LEN=1800
|
||||||
OUTPUT_LEN=20
|
OUTPUT_LEN=20
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
MIN_CACHE_HIT_PCT=60
|
MIN_CACHE_HIT_PCT=60
|
||||||
MAX_LATENCY_ALLOWED_MS=500
|
MAX_LATENCY_ALLOWED_MS=500
|
||||||
```
|
```
|
||||||
|
|||||||
@ -4,13 +4,15 @@
|
|||||||
# See details in README (benchmarks/auto_tune/README.md).
|
# See details in README (benchmarks/auto_tune/README.md).
|
||||||
|
|
||||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||||
BASE=""
|
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||||
|
BASE="$SCRIPT_DIR/../../.."
|
||||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||||
SYSTEM="TPU"
|
SYSTEM="TPU"
|
||||||
TP=1
|
TP=1
|
||||||
DOWNLOAD_DIR=""
|
DOWNLOAD_DIR=""
|
||||||
INPUT_LEN=4000
|
INPUT_LEN=4000
|
||||||
OUTPUT_LEN=16
|
OUTPUT_LEN=16
|
||||||
|
MAX_MODEL_LEN=4096
|
||||||
MIN_CACHE_HIT_PCT=0
|
MIN_CACHE_HIT_PCT=0
|
||||||
MAX_LATENCY_ALLOWED_MS=100000000000
|
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||||
NUM_SEQS_LIST="128 256"
|
NUM_SEQS_LIST="128 256"
|
||||||
@ -36,6 +38,13 @@ current_hash=$(git rev-parse HEAD)
|
|||||||
echo "hash:$current_hash" >> "$RESULT"
|
echo "hash:$current_hash" >> "$RESULT"
|
||||||
echo "current_hash: $current_hash"
|
echo "current_hash: $current_hash"
|
||||||
|
|
||||||
|
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
|
||||||
|
RED='\033[0;31m'
|
||||||
|
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
|
||||||
|
echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
best_throughput=0
|
best_throughput=0
|
||||||
best_max_num_seqs=0
|
best_max_num_seqs=0
|
||||||
best_num_batched_tokens=0
|
best_num_batched_tokens=0
|
||||||
@ -60,7 +69,7 @@ start_server() {
|
|||||||
--enable-prefix-caching \
|
--enable-prefix-caching \
|
||||||
--load-format dummy \
|
--load-format dummy \
|
||||||
--download-dir "$DOWNLOAD_DIR" \
|
--download-dir "$DOWNLOAD_DIR" \
|
||||||
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
|
--max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
|
||||||
|
|
||||||
# wait for 10 minutes...
|
# wait for 10 minutes...
|
||||||
server_started=0
|
server_started=0
|
||||||
@ -245,4 +254,3 @@ done
|
|||||||
echo "finish permutations"
|
echo "finish permutations"
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user