diff --git a/benchmarks/README.md b/benchmarks/README.md
index caff8f034214..1d715a193ea1 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -40,7 +40,7 @@ become available.
wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv |
- | Sonnet |
+ | Sonnet (deprecated) |
✅ |
✅ |
Local file: benchmarks/sonnet.txt |
@@ -51,6 +51,12 @@ become available.
✅ |
synthetic |
+
+ | Prefix Repetition |
+ ✅ |
+ ✅ |
+ synthetic |
+
| HuggingFace-VisionArena |
✅ |
@@ -592,6 +598,20 @@ python3 benchmarks/benchmark_prefix_caching.py \
--input-length-range 128:256
```
+### Prefix Repetition Dataset
+
+```bash
+vllm bench serve \
+ --backend openai \
+ --model meta-llama/Llama-2-7b-chat-hf \
+ --dataset-name prefix_repetition \
+ --num-prompts 100 \
+ --prefix-repetition-prefix-len 512 \
+ --prefix-repetition-suffix-len 128 \
+ --prefix-repetition-num-prefixes 5 \
+ --prefix-repetition-output-len 128
+```
+
## ⚡ Example - Request Prioritization Benchmark
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index fdf6548ada5b..0c19fa6dcfdd 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -18,9 +18,11 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer,
from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset,
ConversationDataset,
- InstructCoderDataset, RandomDataset,
- SampleRequest, ShareGPTDataset,
- SonnetDataset, VisionArenaDataset)
+ InstructCoderDataset,
+ PrefixRepetitionRandomDataset,
+ RandomDataset, SampleRequest,
+ ShareGPTDataset, SonnetDataset,
+ VisionArenaDataset)
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
write_to_json)
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
@@ -327,6 +329,12 @@ def get_requests(args, tokenizer):
dataset_cls = AIMODataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
+ elif args.dataset_name == "prefix_repetition":
+ dataset_cls = PrefixRepetitionRandomDataset
+ sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len
+ sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len
+ sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes
+ sample_kwargs["output_len"] = args.prefix_repetition_output_len
else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values
@@ -356,7 +364,11 @@ def validate_args(args):
raise ValueError(f"Unsupported backend: {args.backend}")
# === Dataset Configuration ===
- if not args.dataset and not args.dataset_path:
+ if (
+ not args.dataset
+ and not args.dataset_path
+ and args.dataset_name not in {"prefix_repetition"}
+ ):
print(
"When dataset path is not set, it will default to random dataset")
args.dataset_name = 'random'
@@ -432,7 +444,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--dataset-name",
type=str,
- choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
+ choices=[
+ "sharegpt", "random", "sonnet", "burstgpt", "hf",
+ "prefix_repetition"
+ ],
help="Name of the dataset to benchmark on.",
default="sharegpt")
parser.add_argument(
@@ -521,6 +536,38 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=None,
help="Split of the HF dataset.")
+ # prefix repetition dataset
+ prefix_repetition_group = parser.add_argument_group(
+ "prefix repetition dataset options")
+ prefix_repetition_group.add_argument(
+ "--prefix-repetition-prefix-len",
+ type=int,
+ default=None,
+ help="Number of prefix tokens per request, used only for prefix "
+ "repetition dataset.",
+ )
+ prefix_repetition_group.add_argument(
+ "--prefix-repetition-suffix-len",
+ type=int,
+ default=None,
+ help="Number of suffix tokens per request, used only for prefix "
+ "repetition dataset. Total input length is prefix_len + suffix_len.",
+ )
+ prefix_repetition_group.add_argument(
+ "--prefix-repetition-num-prefixes",
+ type=int,
+ default=None,
+ help="Number of prefixes to generate, used only for prefix repetition "
+ "dataset. Prompts per prefix is num_requests // num_prefixes.",
+ )
+ prefix_repetition_group.add_argument(
+ "--prefix-repetition-output-len",
+ type=int,
+ default=None,
+ help="Number of output tokens per request, used only for prefix "
+ "repetition dataset.",
+ )
+
parser = AsyncEngineArgs.add_cli_args(parser)