[Benchmark] Parameterization of streaming loading of multimodal datasets (#20528)
Signed-off-by: wangli <wangli858794774@gmail.com>
parent 70ca5484f5
commit 9ff2af6d2b
@@ -701,6 +701,7 @@ class HuggingFaceDataset(BenchmarkDataset):
         self,
         dataset_path: str,
         dataset_split: str,
+        no_stream: bool = False,
         dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:
@@ -708,6 +709,7 @@ class HuggingFaceDataset(BenchmarkDataset):

         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset
+        self.load_stream = not no_stream
         self.load_data()

     def load_data(self) -> None:
@@ -716,7 +718,7 @@ class HuggingFaceDataset(BenchmarkDataset):
             self.dataset_path,
             name=self.dataset_subset,
             split=self.dataset_split,
-            streaming=True,
+            streaming=self.load_stream,
         )
         self.data = self.data.shuffle(seed=self.random_seed)

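For context, a minimal standalone sketch of the behavior these hunks parameterize, assuming the Hugging Face datasets library; the dataset ID and helper name below are placeholders, not part of the commit:

from datasets import load_dataset

def load_benchmark_split(path, split, subset=None, load_stream=True, seed=0):
    # streaming=True returns an IterableDataset that pulls shards lazily over
    # the network; streaming=False downloads and caches the full split first.
    data = load_dataset(path, name=subset, split=split, streaming=load_stream)
    # Both Dataset and IterableDataset expose shuffle(seed=...).
    return data.shuffle(seed=seed)

# load_stream=False corresponds to passing --no-stream to the benchmark CLI.
data = load_benchmark_split("some-org/some-multimodal-dataset", "train",
                            load_stream=False, seed=0)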
@@ -825,6 +825,7 @@ def main(args: argparse.Namespace):
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
            random_seed=args.seed,
+            no_stream=args.no_stream,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
@@ -1033,6 +1034,11 @@ def create_argument_parser():
         help="Path to the sharegpt/sonnet dataset. "
         "Or the huggingface dataset ID if using HF dataset.",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
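A self-contained sketch of the new flag's wiring, using plain argparse; the parser here is a toy stand-in, and only the --no-stream argument itself is taken from the diff:

import argparse

parser = argparse.ArgumentParser(description="toy stand-in parser")
parser.add_argument(
    "--no-stream",
    action="store_true",
    help="Do not load the dataset in streaming mode.",
)

# Streaming stays the default; passing --no-stream flips it off.
args = parser.parse_args(["--no-stream"])
load_stream = not args.no_stream  # mirrors HuggingFaceDataset.load_stream
print(load_stream)  # False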
@@ -356,6 +356,7 @@ def get_requests(args, tokenizer):
     elif args.dataset_name == "burstgpt":
         dataset_cls = BurstGPTDataset
     elif args.dataset_name == "hf":
+        common_kwargs["no_stream"] = args.no_stream
         if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = VisionArenaDataset
             common_kwargs["dataset_subset"] = None
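How the shared kwargs are assumed to reach the dataset constructor, sketched with a stub class; only the common_kwargs["no_stream"] assignment mirrors the diff, the rest is illustrative:

class StubHFDataset:  # stand-in for HuggingFaceDataset and its subclasses
    def __init__(self, dataset_path, random_seed=0, no_stream=False, **kwargs):
        self.load_stream = not no_stream

common_kwargs = {"dataset_path": "some-org/some-dataset", "random_seed": 0}
dataset_name, no_stream_flag = "hf", True  # as if --no-stream were passed
if dataset_name == "hf":
    common_kwargs["no_stream"] = no_stream_flag
dataset = StubHFDataset(**common_kwargs)
print(dataset.load_stream)  # False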
@@ -610,6 +611,11 @@ def create_argument_parser():
         help="Name of the dataset to benchmark on.",
         default="sharegpt",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--dataset",
         type=str,
@@ -481,6 +481,11 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
         help="Name of the dataset to benchmark on.",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--dataset-path",
         type=str,
@@ -674,6 +679,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
            random_seed=args.seed,
+            no_stream=args.no_stream,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
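The construct-then-sample chaining from this hunk, sketched end to end with stub types (illustrative names, not vLLM's classes):

class StubDataset:
    def __init__(self, dataset_split, dataset_subset=None,
                 random_seed=0, no_stream=False):
        self.load_stream = not no_stream
        self.random_seed = random_seed

    def sample(self, num_requests, tokenizer=None):
        # A real implementation draws num_requests items from the streamed
        # or fully materialized dataset; here we only report the mode.
        mode = "streaming" if self.load_stream else "non-streaming"
        return [f"{mode} request {i}" for i in range(num_requests)]

requests = StubDataset(dataset_split="train", no_stream=True).sample(
    num_requests=3, tokenizer=None)
print(requests)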
@@ -971,6 +977,7 @@ class HuggingFaceDataset(BenchmarkDataset):
         self,
         dataset_path: str,
         dataset_split: str,
+        no_stream: bool = False,
         dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:
@@ -978,6 +985,7 @@ class HuggingFaceDataset(BenchmarkDataset):

         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset
+        self.load_stream = not no_stream
         self.load_data()

     def load_data(self) -> None:
@@ -986,7 +994,7 @@ class HuggingFaceDataset(BenchmarkDataset):
             self.dataset_path,
             name=self.dataset_subset,
             split=self.dataset_split,
-            streaming=True,
+            streaming=self.load_stream,
         )
         self.data = self.data.shuffle(seed=self.random_seed)
