From 9ff2af6d2ba1757c6e59fe803225a73e61fe526f Mon Sep 17 00:00:00 2001
From: Li Wang
Date: Wed, 9 Jul 2025 21:35:16 +0800
Subject: [PATCH] [Benchmark] Parameterization of streaming loading of multimodal datasets (#20528)

Signed-off-by: wangli
---
 benchmarks/benchmark_dataset.py    |  4 +++-
 benchmarks/benchmark_serving.py    |  6 ++++++
 benchmarks/benchmark_throughput.py |  6 ++++++
 vllm/benchmarks/datasets.py        | 10 +++++++++-
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 55c0cf851264..8df071d6033f 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -701,6 +701,7 @@ class HuggingFaceDataset(BenchmarkDataset):
         self,
         dataset_path: str,
         dataset_split: str,
+        no_stream: bool = False,
         dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:
@@ -708,6 +709,7 @@ class HuggingFaceDataset(BenchmarkDataset):
 
         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset
+        self.load_stream = not no_stream
         self.load_data()
 
     def load_data(self) -> None:
@@ -716,7 +718,7 @@ class HuggingFaceDataset(BenchmarkDataset):
             self.dataset_path,
             name=self.dataset_subset,
             split=self.dataset_split,
-            streaming=True,
+            streaming=self.load_stream,
         )
         self.data = self.data.shuffle(seed=self.random_seed)
 
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 9b235266dff1..f3a20842137e 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -825,6 +825,7 @@ def main(args: argparse.Namespace):
             dataset_subset=args.hf_subset,
             dataset_split=args.hf_split,
             random_seed=args.seed,
+            no_stream=args.no_stream,
         ).sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
@@ -1033,6 +1034,11 @@ def create_argument_parser():
         help="Path to the sharegpt/sonnet dataset. "
         "Or the huggingface dataset ID if using HF dataset.",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 0ded34c70bad..14461121fece 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -356,6 +356,7 @@ def get_requests(args, tokenizer):
     elif args.dataset_name == "burstgpt":
         dataset_cls = BurstGPTDataset
     elif args.dataset_name == "hf":
+        common_kwargs["no_stream"] = args.no_stream
         if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = VisionArenaDataset
             common_kwargs["dataset_subset"] = None
@@ -610,6 +611,11 @@ def create_argument_parser():
         help="Name of the dataset to benchmark on.",
         default="sharegpt",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--dataset",
         type=str,
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index b3688d2340e4..fdc4e9175a73 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -481,6 +481,11 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
         help="Name of the dataset to benchmark on.",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--dataset-path",
         type=str,
@@ -674,6 +679,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             dataset_subset=args.hf_subset,
             dataset_split=args.hf_split,
             random_seed=args.seed,
+            no_stream=args.no_stream,
         ).sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
@@ -971,6 +977,7 @@ class HuggingFaceDataset(BenchmarkDataset):
         self,
         dataset_path: str,
         dataset_split: str,
+        no_stream: bool = False,
         dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:
@@ -978,6 +985,7 @@ class HuggingFaceDataset(BenchmarkDataset):
 
         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset
+        self.load_stream = not no_stream
         self.load_data()
 
     def load_data(self) -> None:
@@ -986,7 +994,7 @@ class HuggingFaceDataset(BenchmarkDataset):
             self.dataset_path,
             name=self.dataset_subset,
             split=self.dataset_split,
-            streaming=True,
+            streaming=self.load_stream,
         )
         self.data = self.data.shuffle(seed=self.random_seed)
 