From 9b0d1aa27711e8a4c149bf457737e63e40b69b4f Mon Sep 17 00:00:00 2001 From: Reagan Date: Wed, 24 Dec 2025 16:23:24 -0800 Subject: [PATCH] add random-mm, random-rerank --- vllm/benchmarks/datasets.py | 126 +++++++++++--------- vllm/benchmarks/mm_processor.py | 203 ++++++++++++++------------------ vllm/benchmarks/throughput.py | 133 +++++++++++++++++---- 3 files changed, 270 insertions(+), 192 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 49ee0faf049d1..c6709cb018c66 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1436,6 +1436,75 @@ def add_dataset_parser(parser: FlexibleArgumentParser): help="Maximum distance for blazedit dataset. Min: 0, Max: 1.0", ) + # Add random dataset arguments (random-mm and random-rerank) + add_random_dataset_args(parser) + + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + hf_group.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." + ) + hf_group.add_argument( + "--hf-name", + type=str, + default=None, + help=( + "Name of the dataset on HuggingFace " + "(e.g., 'lmarena-ai/VisionArena-Chat'). " + "Specify this if your dataset-path is a local path." + ), + ) + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output lengths " + "from the sampled HF dataset.", + ) + + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options" + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=256, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=256, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=10, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=128, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + +def add_random_dataset_args(parser: FlexibleArgumentParser) -> None: + """Add CLI arguments for random multimodal and random reranking datasets. + + This function adds arguments needed for: + - random-mm (random multimodal dataset) + - random-rerank (random dataset for reranking) + + It can be called directly by benchmark scripts or by add_dataset_parser. + """ random_group = parser.add_argument_group("random dataset options") random_group.add_argument( "--random-input-len", @@ -1580,63 +1649,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser): ), ) - hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - hf_group.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) - hf_group.add_argument( - "--hf-name", - type=str, - default=None, - help=( - "Name of the dataset on HuggingFace " - "(e.g., 'lmarena-ai/VisionArena-Chat'). " - "Specify this if your dataset-path is a local path." - ), - ) - hf_group.add_argument( - "--hf-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output lengths " - "from the sampled HF dataset.", - ) - - prefix_repetition_group = parser.add_argument_group( - "prefix repetition dataset options" - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-prefix-len", - type=int, - default=256, - help="Number of prefix tokens per request, used only for prefix " - "repetition dataset.", - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-suffix-len", - type=int, - default=256, - help="Number of suffix tokens per request, used only for prefix " - "repetition dataset. Total input length is prefix_len + suffix_len.", - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-num-prefixes", - type=int, - default=10, - help="Number of prefixes to generate, used only for prefix repetition " - "dataset. Prompts per prefix is num_requests // num_prefixes.", - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-output-len", - type=int, - default=128, - help="Number of output tokens per request, used only for prefix " - "repetition dataset.", - ) - def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: if not hasattr(args, "request_id_prefix"): diff --git a/vllm/benchmarks/mm_processor.py b/vllm/benchmarks/mm_processor.py index 32e7075fb6408..4bcd116273c7e 100644 --- a/vllm/benchmarks/mm_processor.py +++ b/vllm/benchmarks/mm_processor.py @@ -3,32 +3,33 @@ r"""Benchmark multimodal processor latency. This benchmark measures the latency of the mm processor module -using randomly generated multimodal prompts with synthetic images. +using multimodal prompts from datasets. MM processor stats are automatically enabled. Run: vllm bench mm-processor \ --model \ + --dataset-name random-mm \ --num-prompts 10 \ - --input-len 1024 \ - --output-len 128 \ - --num-images 1 """ import argparse import dataclasses import json import time +import warnings from dataclasses import dataclass from datetime import datetime from typing import Any import numpy as np +from vllm.benchmarks.throughput import get_requests from vllm.engine.arg_utils import EngineArgs from vllm.multimodal.processing import ( get_timing_stats_from_engine_client, ) +from vllm.tokenizers import get_tokenizer from vllm.utils.gc_utils import freeze_gc_heap from vllm.utils.import_utils import PlaceholderModule @@ -37,22 +38,6 @@ try: except ImportError: pd = PlaceholderModule("pandas") - -@dataclass -class MultimodalProcessorBenchmarkMetrics: - """Metrics for multimodal processor benchmark.""" - - completed: int - failed: int - mean_e2el_ms: float - median_e2el_ms: float - std_e2el_ms: float - percentiles_e2el_ms: list[tuple[float, float]] - - """Per-stage timing stats: mean, median, std, percentiles for each stage.""" - mm_processor_stats: dict[str, dict[str, float]] - - def collect_mm_processor_stats( llm_engine: Any, ) -> dict[str, list[float]]: @@ -118,54 +103,56 @@ def calculate_mm_processor_metrics( return metrics -def generate_random_multimodal_prompts( - num_prompts: int, - input_len: int, - output_len: int, - tokenizer: Any, - num_images: int = 1, - image_width: int = 256, - image_height: int = 256, - seed: int = 0, -) -> tuple[list[list[dict]], list[int]]: +def validate_args(args): """ - Generate random multimodal prompts with synthetic images and text tokens. - - Returns: - tuple: (prompts, expected_output_lens) - - prompts: List of OpenAI chat format messages with text and images - - expected_output_lens: List of expected output lengths + Validate command-line arguments for mm_processor benchmark. """ - from PIL import Image + if not getattr(args, "tokenizer", None): + args.tokenizer = args.model - from vllm.benchmarks.datasets import process_image + if getattr(args, "dataset", None) is not None: + warnings.warn( + "The '--dataset' argument will be deprecated in the next release. " + "Please use '--dataset-name' and '--dataset-path' instead.", + stacklevel=2, + ) + args.dataset_path = args.dataset - rng = np.random.default_rng(seed) + if not hasattr(args, "dataset_path"): + args.dataset_path = None + if not hasattr(args, "data_parallel_size"): + args.data_parallel_size = 1 + if not hasattr(args, "lora_path"): + args.lora_path = None + if not hasattr(args, "max_loras"): + args.max_loras = None - prompts = [] - expected_output_lens = [] + # === Random Dataset Argument Conflict Detection === + # Check for conflicts between regular and random arguments when using random datasets + dataset_name = getattr(args, "dataset_name", None) + if dataset_name in {"random", "random-mm", "random-rerank"}: + random_input_len = getattr(args, "random_input_len", None) + random_output_len = getattr(args, "random_output_len", None) + random_prefix_len = getattr(args, "random_prefix_len", None) + input_len = getattr(args, "input_len", None) + output_len = getattr(args, "output_len", None) + prefix_len = getattr(args, "prefix_len", None) - for i in range(num_prompts): - vocab_size = tokenizer.vocab_size - prompt_token_ids = rng.integers(0, vocab_size, size=input_len).tolist() - - text_prompt = tokenizer.decode(prompt_token_ids) - - mm_items = [] - for _ in range(num_images): - random_pixels = rng.integers( - 0, 256, (image_height, image_width, 3), dtype=np.uint8 + if input_len is not None and random_input_len is not None: + raise ValueError( + "Cannot specify both --input-len and --random-input-len. " + "For random datasets, use only one of them (prefer --random-input-len)." + ) + if output_len is not None and random_output_len is not None: + raise ValueError( + "Cannot specify both --output-len and --random-output-len. " + "For random datasets, use only one of them (prefer --random-output-len)." + ) + if prefix_len is not None and random_prefix_len is not None: + raise ValueError( + "Cannot specify both --prefix-len and --random-prefix-len. " + "For random datasets, use only one of them (prefer --random-prefix-len)." ) - image = Image.fromarray(random_pixels) - mm_item = process_image(image) - mm_items.append(mm_item) - - content = [{"type": "text", "text": text_prompt}] - content.extend(mm_items) - prompts.append([{"role": "user", "content": content}]) - expected_output_lens.append(output_len) - - return prompts, expected_output_lens def benchmark_multimodal_processor( @@ -176,28 +163,33 @@ def benchmark_multimodal_processor( """ from vllm import LLM, SamplingParams + validate_args(args) + + if args.seed is None: + args.seed = 0 + + tokenizer = get_tokenizer( + args.tokenizer, + tokenizer_mode=getattr(args, "tokenizer_mode", "auto"), + trust_remote_code=getattr(args, "trust_remote_code", False), + ) + + requests = get_requests(args, tokenizer) + engine_args = EngineArgs.from_cli_args(args) llm = LLM(**dataclasses.asdict(engine_args)) - assert llm.llm_engine.model_config.max_model_len >= ( - args.input_len + args.output_len + assert all( + llm.llm_engine.model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests ), ( - "Please ensure that max_model_len is greater than " - "the sum of input_len and output_len." + "Please ensure that max_model_len is greater than the sum of " + "prompt_len and expected_output_len for all requests." ) - seed = getattr(args, "seed", 0) - tokenizer = llm.get_tokenizer() - prompts, expected_output_lens = generate_random_multimodal_prompts( - num_prompts=args.num_prompts, - input_len=args.input_len, - output_len=args.output_len, - tokenizer=tokenizer, - num_images=args.num_images, - image_width=args.image_width, - image_height=args.image_height, - seed=seed, - ) + prompts = [request.prompt for request in requests] + expected_output_lens = [request.expected_output_len for request in requests] sampling_params = [ SamplingParams( @@ -297,42 +289,32 @@ def add_cli_args(parser: argparse.ArgumentParser) -> None: parser.set_defaults(enable_mm_processor_stats=True) + parser.add_argument( + "--dataset-name", + type=str, + default="random-mm", + help="Name of the dataset to benchmark on. Defaults to 'random-mm'.", + ) + parser.add_argument( + "--prefix-len", + type=int, + default=0, + help="Number of fixed prefix tokens before the random context in a request. " + "For random datasets (random, random-mm, random-rerank), either this " + "argument or the corresponding --random-prefix-len argument can be used, " + "but not both.", + ) parser.add_argument( "--num-prompts", type=int, default=10, help="Number of prompts to process.", ) - parser.add_argument( - "--input-len", - type=int, - default=1024, - help="Number of input tokens per request.", - ) - parser.add_argument( - "--output-len", - type=int, - default=128, - help="Number of output tokens per request.", - ) - parser.add_argument( - "--num-images", - type=int, - default=1, - help="Number of images per prompt.", - ) - parser.add_argument( - "--image-width", - type=int, - default=256, - help="Width of generated images in pixels.", - ) - parser.add_argument( - "--image-height", - type=int, - default=256, - help="Height of generated images in pixels.", - ) + + from vllm.benchmarks.datasets import add_random_dataset_args + + # (random, random-mm, random-rerank) + add_random_dataset_args(parser) parser.add_argument( "--output-json", @@ -414,11 +396,8 @@ def main(args: argparse.Namespace) -> None: result["config"] = { "model": args.model, "num_prompts": args.num_prompts, - "input_len": args.input_len, - "output_len": args.output_len, - "num_images": args.num_images, - "image_width": args.image_width, - "image_height": args.image_height, + "input_len": getattr(args, "random_input_len", None), + "output_len": getattr(args, "random_output_len", None), } result["timestamp"] = datetime.now().isoformat() diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 37b8952a350b4..cab60177b8333 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -24,10 +24,13 @@ from vllm.benchmarks.datasets import ( MultiModalConversationDataset, PrefixRepetitionRandomDataset, RandomDataset, + RandomDatasetForReranking, + RandomMultiModalDataset, SampleRequest, ShareGPTDataset, SonnetDataset, VisionArenaDataset, + add_random_dataset_args, ) from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs @@ -351,7 +354,13 @@ def get_requests(args, tokenizer): and args.dataset_name not in {"prefix_repetition", "random-mm", "random-rerank"} ): sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["prefix_len"] = args.prefix_len + # prefer random_* arguments, fall back to regular arguments + random_prefix_len = getattr(args, "random_prefix_len", None) + sample_kwargs["prefix_len"] = random_prefix_len if random_prefix_len is not None else args.prefix_len + random_input_len = getattr(args, "random_input_len", None) + sample_kwargs["input_len"] = random_input_len if random_input_len is not None else args.input_len + random_output_len = getattr(args, "random_output_len", None) + sample_kwargs["output_len"] = random_output_len if random_output_len is not None else args.output_len dataset_cls = RandomDataset elif args.dataset_name == "sharegpt": dataset_cls = ShareGPTDataset @@ -395,6 +404,39 @@ def get_requests(args, tokenizer): sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes sample_kwargs["output_len"] = args.prefix_repetition_output_len + elif args.dataset_name == "random-mm": + dataset_cls = RandomMultiModalDataset + # prefer random_* arguments, fall back to regular arguments + random_input_len = getattr(args, "random_input_len", None) + sample_kwargs["input_len"] = random_input_len if random_input_len is not None else getattr(args, "input_len", None) + random_output_len = getattr(args, "random_output_len", None) + sample_kwargs["output_len"] = random_output_len if random_output_len is not None else getattr(args, "output_len", None) + sample_kwargs["base_items_per_request"] = getattr( + args, "random_mm_base_items_per_request", None + ) + sample_kwargs["num_mm_items_range_ratio"] = getattr( + args, "random_mm_num_mm_items_range_ratio", None + ) + sample_kwargs["limit_mm_per_prompt"] = getattr( + args, "random_mm_limit_mm_per_prompt", None + ) + sample_kwargs["bucket_config"] = getattr( + args, "random_mm_bucket_config", None + ) + sample_kwargs["enable_multimodal_chat"] = True + random_prefix_len = getattr(args, "random_prefix_len", None) + sample_kwargs["prefix_len"] = random_prefix_len if random_prefix_len is not None else args.prefix_len + sample_kwargs["range_ratio"] = args.random_range_ratio + elif args.dataset_name == "random-rerank": + dataset_cls = RandomDatasetForReranking + # prefer random_* arguments, fall back to regular arguments + random_input_len = getattr(args, "random_input_len", None) + sample_kwargs["input_len"] = random_input_len if random_input_len is not None else getattr(args, "input_len", None) + random_output_len = getattr(args, "random_output_len", None) + sample_kwargs["output_len"] = random_output_len if random_output_len is not None else getattr(args, "output_len", None) + sample_kwargs["batchsize"] = getattr(args, "random_batch_size", 1) + sample_kwargs["is_reranker"] = not getattr(args, "no_reranker", False) + sample_kwargs["range_ratio"] = args.random_range_ratio else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -451,8 +493,11 @@ def validate_args(args): ): print("When dataset path is not set, it will default to random dataset") args.dataset_name = "random" - if args.input_len is None: - raise ValueError("input_len must be provided for a random dataset") + random_input_len = getattr(args, "random_input_len", None) + if args.input_len is None and random_input_len is None: + raise ValueError( + "Either --input-len or --random-input-len must be provided for a random dataset" + ) # === Dataset Name Specific Checks === # --hf-subset and --hf-split: only used @@ -485,26 +530,66 @@ def validate_args(args): else: raise ValueError(f"{args.dataset_path} is not supported by hf dataset.") - # --random-range-ratio: only used when dataset_name is 'random' - if args.dataset_name != "random" and args.random_range_ratio is not None: + # --random-range-ratio: only used when dataset_name is 'random', 'random-mm', or 'random-rerank' + if args.dataset_name not in {"random", "random-mm", "random-rerank"} and args.random_range_ratio is not None: warnings.warn( "--random-range-ratio will be ignored since \ - --dataset-name is not 'random'.", + --dataset-name is not 'random', 'random-mm', or 'random-rerank'.", stacklevel=2, ) - # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not + # --random-batch-size: only used when dataset_name is 'random-rerank' + if args.dataset_name != "random-rerank" and getattr(args, "random_batch_size", None) is not None: + if args.random_batch_size != 1: + warnings.warn( + "--random-batch-size will be ignored since \ + --dataset-name is not 'random-rerank'.", + stacklevel=2, + ) + + # --no-reranker: only used when dataset_name is 'random-rerank' + if args.dataset_name != "random-rerank" and getattr(args, "no_reranker", False): + warnings.warn( + "--no-reranker will be ignored since \ + --dataset-name is not 'random-rerank'.", + stacklevel=2, + ) + + # --prefix-len: only used when dataset_name is 'random', 'random-mm', 'sonnet', or not # set. if ( - args.dataset_name not in {"random", "sonnet", None} + args.dataset_name not in {"random", "random-mm", "sonnet", None} and args.prefix_len is not None ): warnings.warn( "--prefix-len will be ignored since --dataset-name\ - is not 'random', 'sonnet', or not set.", + is not 'random', 'random-mm', 'sonnet', or not set.", stacklevel=2, ) + # === Random Dataset Argument Conflict Detection === + # Check for conflicts between regular and random arguments when using random datasets + if args.dataset_name in {"random", "random-mm", "random-rerank"}: + random_input_len = getattr(args, "random_input_len", None) + random_output_len = getattr(args, "random_output_len", None) + random_prefix_len = getattr(args, "random_prefix_len", None) + + if args.input_len is not None and random_input_len is not None: + raise ValueError( + "Cannot specify both --input-len and --random-input-len. " + "For random datasets, use only one of them (prefer --random-input-len)." + ) + if args.output_len is not None and random_output_len is not None: + raise ValueError( + "Cannot specify both --output-len and --random-output-len. " + "For random datasets, use only one of them (prefer --random-output-len)." + ) + if args.prefix_len is not None and random_prefix_len is not None: + raise ValueError( + "Cannot specify both --prefix-len and --random-prefix-len. " + "For random datasets, use only one of them (prefer --random-prefix-len)." + ) + # === LoRA Settings === if getattr(args, "enable_lora", False) and args.backend != "vllm": raise ValueError("LoRA benchmarking is only supported for vLLM backend") @@ -554,7 +639,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--dataset-name", type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf", "prefix_repetition"], + choices=["sharegpt", "random", "sonnet", "burstgpt", "hf", "prefix_repetition", "random-mm", "random-rerank"], help="Name of the dataset to benchmark on.", default="sharegpt", ) @@ -574,14 +659,20 @@ def add_cli_args(parser: argparse.ArgumentParser): "--input-len", type=int, default=None, - help="Input prompt length for each request", + help="Input prompt length for each request. " + "For random datasets (random, random-mm, random-rerank), either this " + "argument or the corresponding --random-input-len argument can be used, " + "but not both.", ) parser.add_argument( "--output-len", type=int, default=None, help="Output length for each request. Overrides the " - "output length from the dataset.", + "output length from the dataset. " + "For random datasets (random, random-mm, random-rerank), either this " + "argument or the corresponding --random-output-len argument can be used, " + "but not both.", ) parser.add_argument( "--n", type=int, default=1, help="Number of generated sequences per prompt." @@ -634,17 +725,10 @@ def add_cli_args(parser: argparse.ArgumentParser): type=int, default=0, help="Number of fixed prefix tokens before the random " - "context in a request (default: 0).", - ) - # random dataset - parser.add_argument( - "--random-range-ratio", - type=float, - default=0.0, - help="Range ratio for sampling input/output length, " - "used only for RandomDataset. Must be in the range [0, 1) to define " - "a symmetric sampling range " - "[length * (1 - range_ratio), length * (1 + range_ratio)].", + "context in a request (default: 0). " + "For random datasets (random, random-mm, random-rerank), either this " + "argument or the corresponding --random-prefix-len argument can be used, " + "but not both.", ) # hf dtaset @@ -694,6 +778,9 @@ def add_cli_args(parser: argparse.ArgumentParser): "repetition dataset.", ) + # (random, random-mm, random-rerank) + add_random_dataset_args(parser) + parser = AsyncEngineArgs.add_cli_args(parser)