diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index f332566d64f8..c2fbe2bb6d27 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -582,15 +582,6 @@ class HuggingFaceDataset(BenchmarkDataset):
     ) -> None:
         super().__init__(dataset_path=dataset_path, **kwargs)
 
-        # Validate dataset path
-        if self.SUPPORTED_DATASET_PATHS and \
-            self.dataset_path not in self.SUPPORTED_DATASET_PATHS:
-            raise ValueError(
-                f"{self.__class__.__name__} "
-                f"only supports: {', '.join(self.SUPPORTED_DATASET_PATHS)}. "
-                "Please consider contributing if you would "
-                "like to add support for additional dataset formats.")
-
         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset
         self.load_data()
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index dabf2214c84a..ec2ed1a1750b 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -50,9 +50,9 @@ except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
 from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
-                               InstructCoderDataset, RandomDataset,
-                               SampleRequest, ShareGPTDataset, SonnetDataset,
-                               VisionArenaDataset)
+                               HuggingFaceDataset, InstructCoderDataset,
+                               RandomDataset, SampleRequest, ShareGPTDataset,
+                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -595,6 +595,17 @@ def main(args: argparse.Namespace):
             args.hf_split = "train"
         elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
             dataset_class = ConversationDataset
+        else:
+            supported_datasets = set([
+                dataset_name for cls in HuggingFaceDataset.__subclasses__()
+                for dataset_name in cls.SUPPORTED_DATASET_PATHS
+            ])
+            raise ValueError(
+                f"Unsupported dataset path: {args.dataset_path}. "
+                "Huggingface dataset only supports dataset_path"
+                f" from one of following: {supported_datasets}. "
+                "Please consider contributing if you would "
+                "like to add support for additional dataset formats.")
         input_requests = dataset_class(
             dataset_path=args.dataset_path,
             dataset_subset=args.hf_subset,
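
Note on the pattern above: the per-subclass check removed from HuggingFaceDataset.__init__ is replaced by a single check in benchmark_serving.py that aggregates SUPPORTED_DATASET_PATHS across all HuggingFaceDataset subclasses via __subclasses__(). Below is a minimal, self-contained sketch of that lookup pattern; the class names, the resolve_dataset_class helper, and the dataset paths are placeholders for illustration, not the actual benchmark code.

# Illustrative sketch only -- not vLLM code; names and paths are hypothetical.
class HuggingFaceDataset:
    SUPPORTED_DATASET_PATHS: set[str] = set()


class VisionArenaDataset(HuggingFaceDataset):
    SUPPORTED_DATASET_PATHS = {"example-org/vision-arena"}


class ConversationDataset(HuggingFaceDataset):
    SUPPORTED_DATASET_PATHS = {"example-org/conversations"}


def resolve_dataset_class(dataset_path: str) -> type[HuggingFaceDataset]:
    """Pick the subclass that supports dataset_path, or fail listing every known path."""
    for cls in HuggingFaceDataset.__subclasses__():
        if dataset_path in cls.SUPPORTED_DATASET_PATHS:
            return cls
    # Collect all supported paths from every subclass for the error message,
    # mirroring the aggregation done in the diff's else branch.
    supported = {
        name
        for cls in HuggingFaceDataset.__subclasses__()
        for name in cls.SUPPORTED_DATASET_PATHS
    }
    raise ValueError(f"Unsupported dataset path: {dataset_path}. "
                     f"Supported paths: {supported}.")


# Usage: resolve_dataset_class("example-org/conversations") returns ConversationDataset;
# an unknown path raises ValueError listing both supported paths.

Moving the check to the call site means the error message can list every supported path across all subclasses, instead of only the paths of the one subclass that happened to be constructed.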