mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-27 03:34:26 +08:00
[Benchmark] Add support for local hf dataset path in benchmark (#23999)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
This commit is contained in:
parent
0e1759cd54
commit
c83c4ff815
@ -110,7 +110,12 @@ become available.
|
|||||||
|
|
||||||
🚧: to be supported
|
🚧: to be supported
|
||||||
|
|
||||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
|
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
|
||||||
|
For a local `dataset-path`, set `hf-name` to the dataset's Hugging Face ID, for example:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
|
||||||
|
```
|
||||||
|
|
||||||
## 🚀 Example - Online Benchmark
|
## 🚀 Example - Online Benchmark
|
||||||
|
|
||||||
|
|||||||
@ -1227,6 +1227,16 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
|
|||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help="Split of the HF dataset.")
|
help="Split of the HF dataset.")
|
||||||
|
hf_group.add_argument(
|
||||||
|
"--hf-name",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help=(
|
||||||
|
"Name of the dataset on HuggingFace "
|
||||||
|
"(e.g., 'lmarena-ai/VisionArena-Chat'). "
|
||||||
|
"Specify this if your dataset-path is a local path."
|
||||||
|
),
|
||||||
|
)
|
||||||
hf_group.add_argument(
|
hf_group.add_argument(
|
||||||
"--hf-output-len",
|
"--hf-output-len",
|
||||||
type=int,
|
type=int,
|
||||||
@ -1307,28 +1317,53 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
|
|||||||
elif args.dataset_name == "hf":
|
elif args.dataset_name == "hf":
|
||||||
# all following datasets are implemented from the
|
# all following datasets are implemented from the
|
||||||
# HuggingFaceDataset base class
|
# HuggingFaceDataset base class
|
||||||
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
|
if (
|
||||||
|
args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
):
|
||||||
dataset_class = VisionArenaDataset
|
dataset_class = VisionArenaDataset
|
||||||
args.hf_split = "train"
|
args.hf_split = "train"
|
||||||
args.hf_subset = None
|
args.hf_subset = None
|
||||||
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
|
elif (
|
||||||
|
args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
):
|
||||||
dataset_class = InstructCoderDataset
|
dataset_class = InstructCoderDataset
|
||||||
args.hf_split = "train"
|
args.hf_split = "train"
|
||||||
elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
|
elif (
|
||||||
|
args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
):
|
||||||
dataset_class = MTBenchDataset
|
dataset_class = MTBenchDataset
|
||||||
args.hf_split = "train"
|
args.hf_split = "train"
|
||||||
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
|
elif (
|
||||||
|
args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
):
|
||||||
dataset_class = ConversationDataset
|
dataset_class = ConversationDataset
|
||||||
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
|
elif (
|
||||||
|
args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS
|
||||||
|
or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS
|
||||||
|
):
|
||||||
dataset_class = AIMODataset
|
dataset_class = AIMODataset
|
||||||
args.hf_split = "train"
|
args.hf_split = "train"
|
||||||
elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501
|
elif (
|
||||||
|
args.dataset_path
|
||||||
|
in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS # noqa: E501
|
||||||
|
or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
):
|
||||||
dataset_class = NextEditPredictionDataset
|
dataset_class = NextEditPredictionDataset
|
||||||
args.hf_split = "train"
|
args.hf_split = "train"
|
||||||
elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
|
elif (
|
||||||
|
args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
):
|
||||||
dataset_class = ASRDataset
|
dataset_class = ASRDataset
|
||||||
args.hf_split = "train"
|
args.hf_split = "train"
|
||||||
elif args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS:
|
elif (
|
||||||
|
args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS
|
||||||
|
):
|
||||||
dataset_class = MLPerfDataset
|
dataset_class = MLPerfDataset
|
||||||
args.hf_split = "train"
|
args.hf_split = "train"
|
||||||
else:
|
else:
|
||||||
@ -1358,6 +1393,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
|
|||||||
dataset_split=args.hf_split,
|
dataset_split=args.hf_split,
|
||||||
random_seed=args.seed,
|
random_seed=args.seed,
|
||||||
no_stream=args.no_stream,
|
no_stream=args.no_stream,
|
||||||
|
hf_name=args.hf_name,
|
||||||
).sample(
|
).sample(
|
||||||
num_requests=args.num_prompts,
|
num_requests=args.num_prompts,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
@ -1710,6 +1746,7 @@ class HuggingFaceDataset(BenchmarkDataset):
|
|||||||
dataset_split: str,
|
dataset_split: str,
|
||||||
no_stream: bool = False,
|
no_stream: bool = False,
|
||||||
dataset_subset: Optional[str] = None,
|
dataset_subset: Optional[str] = None,
|
||||||
|
hf_name: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(dataset_path=dataset_path, **kwargs)
|
super().__init__(dataset_path=dataset_path, **kwargs)
|
||||||
@ -1717,6 +1754,7 @@ class HuggingFaceDataset(BenchmarkDataset):
|
|||||||
self.dataset_split = dataset_split
|
self.dataset_split = dataset_split
|
||||||
self.dataset_subset = dataset_subset
|
self.dataset_subset = dataset_subset
|
||||||
self.load_stream = not no_stream
|
self.load_stream = not no_stream
|
||||||
|
self.hf_name = hf_name or dataset_path
|
||||||
self.load_data()
|
self.load_data()
|
||||||
|
|
||||||
def load_data(self) -> None:
|
def load_data(self) -> None:
|
||||||
@ -1827,10 +1865,9 @@ class VisionArenaDataset(HuggingFaceDataset):
|
|||||||
for i, item in enumerate(self.data):
|
for i, item in enumerate(self.data):
|
||||||
if len(sampled_requests) >= num_requests:
|
if len(sampled_requests) >= num_requests:
|
||||||
break
|
break
|
||||||
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
|
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
|
||||||
if parser_fn is None:
|
if parser_fn is None:
|
||||||
raise ValueError(
|
raise ValueError(f"Unsupported dataset path: {self.hf_name}")
|
||||||
f"Unsupported dataset path: {self.dataset_path}")
|
|
||||||
prompt = parser_fn(item)
|
prompt = parser_fn(item)
|
||||||
mm_content = process_image(item["images"][0])
|
mm_content = process_image(item["images"][0])
|
||||||
prompt_len = len(tokenizer(prompt).input_ids)
|
prompt_len = len(tokenizer(prompt).input_ids)
|
||||||
@ -2099,10 +2136,9 @@ class NextEditPredictionDataset(HuggingFaceDataset):
|
|||||||
def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
|
def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
|
||||||
request_id_prefix: str = "",
|
request_id_prefix: str = "",
|
||||||
**kwargs):
|
**kwargs):
|
||||||
formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(
|
formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name)
|
||||||
self.dataset_path)
|
|
||||||
if formatting_prompt_func is None:
|
if formatting_prompt_func is None:
|
||||||
raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
|
raise ValueError(f"Unsupported dataset path: {self.hf_name}")
|
||||||
samples = []
|
samples = []
|
||||||
for i, sample in enumerate(self.data):
|
for i, sample in enumerate(self.data):
|
||||||
sample = formatting_prompt_func(sample)
|
sample = formatting_prompt_func(sample)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user