From 31436e8b4f0a210cf5c6f9ec9b47fc1b5d7cc47a Mon Sep 17 00:00:00 2001
From: hustxiayang
Date: Tue, 19 Aug 2025 04:32:18 -0400
Subject: [PATCH] [Misc] Add request_id into benchmark_serve.py (#23065)

Signed-off-by: yangxia
---
 benchmarks/backend_request_func.py           |  23 +++-
 benchmarks/benchmark_dataset.py              | 109 ++++++++++++++----
 benchmarks/benchmark_serving.py              |  23 +++-
 vllm/benchmarks/datasets.py                  | 113 +++++++++++++++----
 vllm/benchmarks/lib/endpoint_request_func.py |   7 ++
 vllm/benchmarks/serve.py                     |  14 ++-
 6 files changed, 243 insertions(+), 46 deletions(-)

diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 1559ca2d92841..ba7c733be0b25 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -34,6 +34,7 @@ class RequestFuncInput:
     multi_modal_content: Optional[dict | list[dict]] = None
     ignore_eos: bool = False
     language: Optional[str] = None
+    request_id: Optional[str] = None
 
 
 @dataclass
@@ -71,6 +72,9 @@ async def async_request_tgi(
             "inputs": request_func_input.prompt,
             "parameters": params,
         }
+        headers = None
+        if request_func_input.request_id:
+            headers = {"x-request-id": request_func_input.request_id}
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
         if request_func_input.ignore_eos:
@@ -82,7 +86,9 @@
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
-            async with session.post(url=api_url, json=payload) as response:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
                 if response.status == 200:
                     async for chunk_bytes in response.content:
                         chunk_bytes = chunk_bytes.strip()
@@ -145,6 +151,9 @@ async def async_request_trt_llm(
         }
         if request_func_input.ignore_eos:
             payload["min_length"] = request_func_input.output_len
+        headers = None
+        if request_func_input.request_id:
+            headers = {"x-request-id": request_func_input.request_id}
 
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
@@ -152,7 +161,9 @@
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
-            async with session.post(url=api_url, json=payload) as response:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
                 if response.status == 200:
                     async for chunk_bytes in response.content:
                         chunk_bytes = chunk_bytes.strip()
@@ -211,6 +222,8 @@ async def async_request_deepspeed_mii(
         "top_p": 1.0,
     }
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -283,6 +296,8 @@ async def async_request_openai_completions(
     if request_func_input.extra_body:
         payload.update(request_func_input.extra_body)
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -395,6 +410,8 @@ async def async_request_openai_chat_completions(
         "Content-Type": "application/json",
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
     }
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -491,6 +508,8 @@ async def async_request_openai_audio(
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
     }
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id
 
     # Send audio file
     def to_bytes(y, sr):
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 572292a5aca46..c62934ed94cb5 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -19,6 +19,7 @@ import logging
 import random
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
+from copy import deepcopy
 from dataclasses import dataclass
 from functools import cache
 from io import BytesIO
@@ -54,6 +55,7 @@ class SampleRequest:
     expected_output_len: int
     multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
     lora_request: Optional[LoRARequest] = None
+    request_id: Optional[str] = None
 
 
 # -----------------------------------------------------------------------------
@@ -155,7 +157,10 @@ class BenchmarkDataset(ABC):
 
     @abstractmethod
     def sample(
-        self, tokenizer: PreTrainedTokenizerBase, num_requests: int
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        request_id_prefix: str = "",
     ) -> list[SampleRequest]:
         """
         Abstract method to generate sample requests from the dataset.
@@ -167,6 +172,7 @@
             tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
                 for processing the dataset's text.
             num_requests (int): The number of sample requests to generate.
+            request_id_prefix (str): The prefix of request_id.
 
         Returns:
             list[SampleRequest]: A list of sample requests generated from the
@@ -175,7 +181,10 @@
         raise NotImplementedError("sample must be implemented in subclasses.")
 
     def maybe_oversample_requests(
-        self, requests: list[SampleRequest], num_requests: int
+        self,
+        requests: list[SampleRequest],
+        num_requests: int,
+        request_id_prefix: str = "",
     ) -> None:
         """
         Oversamples the list of requests if its size is less than the desired
         number.
 
         Args:
             requests (List[SampleRequest]): The current list of sampled
-                requests. num_requests (int): The target number of requests.
+                requests.
+            num_requests (int): The target number of requests.
+            request_id_prefix (str): The prefix of the request ids.
""" if len(requests) < num_requests: random.seed(self.random_seed) - additional = random.choices(requests, k=num_requests - len(requests)) + additional = deepcopy( + random.choices(requests, k=num_requests - len(requests)) + ) + for i in range(len(additional)): + req = additional[i] + req.request_id = request_id_prefix + str(len(requests) + i) requests.extend(additional) logger.info("Oversampled requests to reach %d total samples.", num_requests) @@ -303,6 +319,7 @@ class RandomDataset(BenchmarkDataset): range_ratio: float = DEFAULT_RANGE_RATIO, input_len: int = DEFAULT_INPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN, + request_id_prefix: str = "", **kwargs, ) -> list[SampleRequest]: # Enforce range_ratio < 1 @@ -363,8 +380,10 @@ class RandomDataset(BenchmarkDataset): prompt=prompt, prompt_len=total_input_len, expected_output_len=int(output_lens[i]), + request_id=request_id_prefix + str(i), ) ) + return requests @@ -406,9 +425,11 @@ class ShareGPTDataset(BenchmarkDataset): max_loras: Optional[int] = None, output_len: Optional[int] = None, enable_multimodal_chat: bool = False, + request_id_prefix: str = "", **kwargs, ) -> list: samples: list = [] + ind = 0 for entry in self.data: if len(samples) >= num_requests: break @@ -444,9 +465,11 @@ class ShareGPTDataset(BenchmarkDataset): expected_output_len=new_output_len, lora_request=lora_request, multi_modal_data=mm_content, + request_id=request_id_prefix + str(ind), ) ) - self.maybe_oversample_requests(samples, num_requests) + ind += 1 + self.maybe_oversample_requests(samples, num_requests, request_id_prefix) return samples @@ -512,10 +535,11 @@ class CustomDataset(BenchmarkDataset): output_len: Optional[int] = None, enable_multimodal_chat: bool = False, skip_chat_template: bool = False, + request_id_prefix: str = "", **kwargs, ) -> list: sampled_requests = [] - for item in self.data: + for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break prompt = item["prompt"] @@ -534,9 +558,12 @@ class CustomDataset(BenchmarkDataset): prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len, + request_id=request_id_prefix + str(i), ) ) - self.maybe_oversample_requests(sampled_requests, num_requests) + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix + ) return sampled_requests @@ -578,6 +605,7 @@ class SonnetDataset(BenchmarkDataset): input_len: int = DEFAULT_INPUT_LEN, output_len: int = DEFAULT_OUTPUT_LEN, return_prompt_formatted: bool = False, + request_id_prefix: str = "", **kwargs, ) -> list: # Calculate average token length for a poem line. 
@@ -603,6 +631,7 @@ class SonnetDataset(BenchmarkDataset): prefix_lines = self.data[:num_prefix_lines] samples = [] + ind = 0 while len(samples) < num_requests: extra_lines = random.choices( self.data, k=num_input_lines - num_prefix_lines @@ -613,14 +642,17 @@ class SonnetDataset(BenchmarkDataset): msg, add_generation_prompt=True, tokenize=False ) prompt_len = len(tokenizer(prompt_formatted).input_ids) + if prompt_len <= input_len: samples.append( SampleRequest( prompt=prompt_formatted if return_prompt_formatted else prompt, prompt_len=prompt_len, expected_output_len=output_len, + request_id=request_id_prefix + str(ind), ) ) + ind += 1 return samples @@ -672,6 +704,7 @@ class BurstGPTDataset(BenchmarkDataset): num_requests: int, max_loras: Optional[int] = None, lora_path: Optional[str] = None, + request_id_prefix: str = "", **kwargs, ) -> list[SampleRequest]: samples = [] @@ -693,6 +726,7 @@ class BurstGPTDataset(BenchmarkDataset): prompt_len=input_len, expected_output_len=output_len, lora_request=lora_req, + request_id=request_id_prefix + str(i), ) ) return samples @@ -752,12 +786,14 @@ class ConversationDataset(HuggingFaceDataset): num_requests: int, output_len: Optional[int] = None, enable_multimodal_chat: bool = False, + request_id_prefix: str = "", **kwargs, ) -> list: # Filter examples with at least 2 conversations filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2) sampled_requests = [] dynamic_output = output_len is None + ind = 0 for item in filtered_data: if len(sampled_requests) >= num_requests: @@ -785,9 +821,13 @@ class ConversationDataset(HuggingFaceDataset): prompt_len=prompt_len, expected_output_len=output_len, multi_modal_data=mm_content, + request_id=request_id_prefix + str(ind), ) ) - self.maybe_oversample_requests(sampled_requests, num_requests) + ind += 1 + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix + ) return sampled_requests @@ -814,11 +854,12 @@ class VisionArenaDataset(HuggingFaceDataset): num_requests: int, output_len: Optional[int] = None, enable_multimodal_chat: bool = False, + request_id_prefix: str = "", **kwargs, ) -> list: output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN sampled_requests = [] - for item in self.data: + for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path) @@ -838,9 +879,12 @@ class VisionArenaDataset(HuggingFaceDataset): prompt_len=prompt_len, expected_output_len=output_len, multi_modal_data=mm_content, + request_id=request_id_prefix + str(i), ) ) - self.maybe_oversample_requests(sampled_requests, num_requests) + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix + ) return sampled_requests @@ -870,11 +914,12 @@ class InstructCoderDataset(HuggingFaceDataset): num_requests: int, output_len: Optional[int] = None, enable_multimodal_chat: bool = False, + request_id_prefix: str = "", **kwargs, ) -> list: output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN sampled_requests = [] - for item in self.data: + for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break prompt = f"{item['input']}\n\n{item['instruction']} Just output \ @@ -892,9 +937,12 @@ class InstructCoderDataset(HuggingFaceDataset): prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len, + request_id=request_id_prefix + str(i), ) ) - self.maybe_oversample_requests(sampled_requests, num_requests) + 
self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix + ) return sampled_requests @@ -924,12 +972,13 @@ class MTBenchDataset(HuggingFaceDataset): num_requests: int, output_len: Optional[int] = None, enable_multimodal_chat: bool = False, + request_id_prefix: str = "", **kwargs, ) -> list: output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN sampled_requests = [] - for item in self.data: + for i, item in enumerate(self.data): if len(sampled_requests) >= num_requests: break prompt = item["turns"][0] @@ -947,9 +996,12 @@ class MTBenchDataset(HuggingFaceDataset): prompt=prompt, prompt_len=prompt_len, expected_output_len=output_len, + request_id=request_id_prefix + str(i), ) ) - self.maybe_oversample_requests(sampled_requests, num_requests) + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix + ) return sampled_requests @@ -974,10 +1026,12 @@ class AIMODataset(HuggingFaceDataset): tokenizer: PreTrainedTokenizerBase, num_requests: int, output_len: Optional[int] = None, + request_id_prefix: str = "", **kwargs, ) -> list: sampled_requests = [] dynamic_output = output_len is None + ind = 0 for item in self.data: if len(sampled_requests) >= num_requests: @@ -1000,9 +1054,13 @@ class AIMODataset(HuggingFaceDataset): prompt_len=prompt_len, expected_output_len=output_len, multi_modal_data=None, + request_id=request_id_prefix + str(ind), ) ) - self.maybe_oversample_requests(sampled_requests, num_requests) + ind += 1 + self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix + ) return sampled_requests @@ -1072,12 +1130,18 @@ class NextEditPredictionDataset(HuggingFaceDataset): "zed-industries/zeta": _format_zeta_prompt, } - def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs): + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + request_id_prefix: str = "", + **kwargs, + ): formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path) if formatting_prompt_func is None: raise ValueError(f"Unsupported dataset path: {self.dataset_path}") samples = [] - for sample in self.data: + for i, sample in enumerate(self.data): sample = formatting_prompt_func(sample) samples.append( SampleRequest( @@ -1086,11 +1150,12 @@ class NextEditPredictionDataset(HuggingFaceDataset): expected_output_len=len( tokenizer(sample["expected_output"]).input_ids ), + request_id=request_id_prefix + str(i), ) ) if len(samples) >= num_requests: break - self.maybe_oversample_requests(samples, num_requests) + self.maybe_oversample_requests(samples, num_requests, request_id_prefix) return samples @@ -1139,6 +1204,7 @@ class ASRDataset(HuggingFaceDataset): tokenizer: PreTrainedTokenizerBase, num_requests: int, output_len: Optional[int] = None, + request_id_prefix: str = "", **kwargs, ) -> list: import librosa @@ -1148,6 +1214,7 @@ class ASRDataset(HuggingFaceDataset): prompt_len = len(tokenizer(prompt).input_ids) sampled_requests = [] skipped = 0 + ind = 0 for item in self.data: if len(sampled_requests) >= num_requests: break @@ -1166,8 +1233,10 @@ class ASRDataset(HuggingFaceDataset): prompt_len=prompt_len, expected_output_len=output_len, multi_modal_data=mm_content, + request_id=request_id_prefix + str(ind), ) ) + ind += 1 if skipped: logger.warning( "%d samples discarded from dataset due to" @@ -1175,5 +1244,7 @@ class ASRDataset(HuggingFaceDataset): " what Whisper supports.", skipped, ) - self.maybe_oversample_requests(sampled_requests, num_requests) + 
self.maybe_oversample_requests( + sampled_requests, num_requests, request_id_prefix + ) return sampled_requests diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index ae38caf7290b1..02f5f585c0c16 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -375,11 +375,12 @@ async def benchmark( rps_change_events.append({"rps": rps_val, "timestamp": timestamp}) last_int_rps = current_int_rps - prompt, prompt_len, output_len, mm_content = ( + prompt, prompt_len, output_len, mm_content, request_id = ( request.prompt, request.prompt_len, request.expected_output_len, request.multi_modal_data, + request.request_id, ) req_model_id, req_model_name = model_id, model_name if lora_modules: @@ -397,6 +398,7 @@ async def benchmark( multi_modal_content=mm_content, ignore_eos=ignore_eos, extra_body=extra_body, + request_id=request_id, ) task = limited_request_func(request_func_input=request_func_input, pbar=pbar) tasks.append(asyncio.create_task(task)) @@ -665,6 +667,7 @@ def main(args: argparse.Namespace): tokenizer=tokenizer, output_len=args.custom_output_len, skip_chat_template=args.custom_skip_chat_template, + request_id_prefix=args.request_id_prefix, ) elif args.dataset_name == "sonnet": @@ -678,6 +681,7 @@ def main(args: argparse.Namespace): prefix_len=args.sonnet_prefix_len, tokenizer=tokenizer, return_prompt_formatted=False, + request_id_prefix=args.request_id_prefix, ) else: assert tokenizer.chat_template or tokenizer.default_chat_template, ( @@ -690,6 +694,7 @@ def main(args: argparse.Namespace): prefix_len=args.sonnet_prefix_len, tokenizer=tokenizer, return_prompt_formatted=True, + request_id_prefix=args.request_id_prefix, ) elif args.dataset_name == "hf": @@ -751,6 +756,7 @@ def main(args: argparse.Namespace): num_requests=args.num_prompts, tokenizer=tokenizer, output_len=args.hf_output_len, + request_id_prefix=args.request_id_prefix, ) else: @@ -762,10 +768,15 @@ def main(args: argparse.Namespace): tokenizer=tokenizer, num_requests=args.num_prompts, output_len=args.sharegpt_output_len, + request_id_prefix=args.request_id_prefix, ), "burstgpt": lambda: BurstGPTDataset( random_seed=args.seed, dataset_path=args.dataset_path - ).sample(tokenizer=tokenizer, num_requests=args.num_prompts), + ).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + request_id_prefix=args.request_id_prefix, + ), "random": lambda: RandomDataset(dataset_path=args.dataset_path).sample( tokenizer=tokenizer, num_requests=args.num_prompts, @@ -773,6 +784,7 @@ def main(args: argparse.Namespace): input_len=args.random_input_len, output_len=args.random_output_len, range_ratio=args.random_range_ratio, + request_id_prefix=args.request_id_prefix, ), } @@ -1118,6 +1130,13 @@ def create_argument_parser(): "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " "and the blog: https://hao-ai-lab.github.io/blogs/distserve", ) + parser.add_argument( + "--request-id-prefix", + type=str, + required=False, + default="benchmark-serving", + help="Specify the prefix of request id.", + ) # group for dataset specific arguments custom_group = parser.add_argument_group("custom dataset options") diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 72d7ce49b8e14..b575e8b9e0a01 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -18,6 +18,7 @@ import logging import random from abc import ABC, abstractmethod from collections.abc import Mapping +from copy import deepcopy from dataclasses import dataclass from functools 
 from io import BytesIO
@@ -76,6 +77,7 @@ class SampleRequest:
         Union[MultiModalDataDict, dict, list[dict]]
     ] = None
     lora_request: Optional[LoRARequest] = None
+    request_id: Optional[str] = None
 
 
 # -----------------------------------------------------------------------------
@@ -183,7 +185,8 @@
     @abstractmethod
     def sample(self, tokenizer: PreTrainedTokenizerBase,
-               num_requests: int) -> list[SampleRequest]:
+               num_requests: int,
+               request_id_prefix: str = "") -> list[SampleRequest]:
         """
         Abstract method to generate sample requests from the dataset.
@@ -194,6 +197,8 @@
             tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
                 for processing the dataset's text.
             num_requests (int): The number of sample requests to generate.
+            request_id_prefix (str): The prefix of request_id.
+
         Returns:
             list[SampleRequest]: A list of sample requests generated from the
@@ -201,8 +206,12 @@
         """
         raise NotImplementedError("sample must be implemented in subclasses.")
 
-    def maybe_oversample_requests(self, requests: list[SampleRequest],
-                                  num_requests: int) -> None:
+    def maybe_oversample_requests(
+        self,
+        requests: list[SampleRequest],
+        num_requests: int,
+        request_id_prefix: str = "",
+    ) -> None:
         """
         Oversamples the list of requests if its size is less than the desired
         number.
@@ -211,11 +220,17 @@
         Args:
             requests (List[SampleRequest]): The current list of sampled
                 requests.
             num_requests (int): The target number of requests.
+            request_id_prefix (str): The prefix of the request ids.
+
         """
         if len(requests) < num_requests:
             random.seed(self.random_seed)
-            additional = random.choices(requests,
-                                        k=num_requests - len(requests))
+            additional = deepcopy(
+                random.choices(requests, k=num_requests - len(requests))
+            )
+            for i in range(len(additional)):
+                req = additional[i]
+                req.request_id = request_id_prefix + str(len(requests) + i)
             requests.extend(additional)
             logger.info("Oversampled requests to reach %d total samples.",
                         num_requests)
@@ -334,6 +349,7 @@
         range_ratio: float = DEFAULT_RANGE_RATIO,
         input_len: int = DEFAULT_INPUT_LEN,
         output_len: int = DEFAULT_OUTPUT_LEN,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list[SampleRequest]:
         # Enforce range_ratio < 1
@@ -391,6 +407,7 @@
                     prompt=prompt,
                     prompt_len=total_input_len,
                     expected_output_len=int(output_lens[i]),
+                    request_id=request_id_prefix + str(i),
                 ))
         return requests
@@ -432,9 +449,11 @@
         max_loras: Optional[int] = None,
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list:
         samples: list = []
+        ind = 0
         for entry in self.data:
             if len(samples) >= num_requests:
                 break
@@ -470,8 +489,10 @@
                     expected_output_len=new_output_len,
                     lora_request=lora_request,
                     multi_modal_data=mm_content,
+                    request_id=request_id_prefix + str(ind),
                 ))
-        self.maybe_oversample_requests(samples, num_requests)
+            ind += 1
+        self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
         return samples
@@ -647,6 +668,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             tokenizer=tokenizer,
             output_len=args.custom_output_len,
             skip_chat_template=args.custom_skip_chat_template,
+            request_id_prefix=args.request_id_prefix,
         )
 
     elif args.dataset_name == "sonnet":
@@ -660,6 +682,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                 prefix_len=args.sonnet_prefix_len,
                 tokenizer=tokenizer,
                 return_prompt_formatted=False,
+                request_id_prefix=args.request_id_prefix,
             )
         else:
             assert tokenizer.chat_template or tokenizer.default_chat_template, (
@@ -671,6 +694,7 @@
                 prefix_len=args.sonnet_prefix_len,
                 tokenizer=tokenizer,
                 return_prompt_formatted=True,
+                request_id_prefix=args.request_id_prefix,
             )
 
     elif args.dataset_name == "hf":
@@ -730,6 +754,7 @@
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             output_len=args.hf_output_len,
+            request_id_prefix=args.request_id_prefix,
         )
 
     else:
@@ -741,11 +766,13 @@
                 tokenizer=tokenizer,
                 num_requests=args.num_prompts,
                 output_len=args.sharegpt_output_len,
+                request_id_prefix=args.request_id_prefix,
             ),
         "burstgpt":
         lambda: BurstGPTDataset(random_seed=args.seed,
                                 dataset_path=args.dataset_path).
-        sample(tokenizer=tokenizer, num_requests=args.num_prompts),
+        sample(tokenizer=tokenizer, num_requests=args.num_prompts,
+               request_id_prefix=args.request_id_prefix,),
         "random":
         lambda: RandomDataset(random_seed=args.seed,
                               dataset_path=args.dataset_path).sample(
@@ -755,6 +782,7 @@
                 input_len=args.random_input_len,
                 output_len=args.random_output_len,
                 range_ratio=args.random_range_ratio,
+                request_id_prefix=args.request_id_prefix,
             ),
         "prefix_repetition":
         lambda: PrefixRepetitionRandomDataset(
@@ -766,6 +794,7 @@
                 suffix_len=args.prefix_repetition_suffix_len,
                 num_prefixes=args.prefix_repetition_num_prefixes,
                 output_len=args.prefix_repetition_output_len,
+                request_id_prefix=args.request_id_prefix,
             ),
     }
 
@@ -839,10 +868,11 @@
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
         skip_chat_template: bool = False,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list:
         sampled_requests = []
-        for item in self.data:
+        for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
                 break
             prompt = item["prompt"]
@@ -864,8 +894,10 @@
                     prompt=prompt,
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
+                    request_id=request_id_prefix + str(i),
                 ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
+        self.maybe_oversample_requests(sampled_requests, num_requests,
+                                       request_id_prefix)
         return sampled_requests
@@ -909,6 +941,7 @@
         input_len: int = DEFAULT_INPUT_LEN,
         output_len: int = DEFAULT_OUTPUT_LEN,
         return_prompt_formatted: bool = False,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list:
         # Calculate average token length for a poem line.
@@ -934,6 +967,7 @@
         prefix_lines = self.data[:num_prefix_lines]
 
         samples = []
+        ind = 0
         while len(samples) < num_requests:
             extra_lines = random.choices(self.data,
                                          k=num_input_lines - num_prefix_lines)
@@ -949,7 +983,9 @@
                     prompt=prompt_formatted
                     if return_prompt_formatted else prompt,
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
+                    request_id=request_id_prefix + str(ind),
                 ))
+            ind += 1
         return samples
@@ -1000,6 +1036,7 @@
         num_requests: int,
         max_loras: Optional[int] = None,
         lora_path: Optional[str] = None,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list[SampleRequest]:
         samples = []
@@ -1020,6 +1057,7 @@
                     prompt_len=input_len,
                     expected_output_len=output_len,
                     lora_request=lora_req,
+                    request_id=request_id_prefix + str(i),
                 ))
         return samples
@@ -1075,11 +1113,13 @@
         num_requests: int,
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
         **kwargs) -> list:
         # Filter examples with at least 2 conversations
         filtered_data = self.data.filter(
             lambda x: len(x["conversations"]) >= 2)
         sampled_requests = []
+        ind = 0
         dynamic_output = output_len is None
 
         for item in filtered_data:
@@ -1111,8 +1151,11 @@
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
+                    request_id=request_id_prefix + str(ind),
                 ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
+            ind += 1
+        self.maybe_oversample_requests(sampled_requests, num_requests,
+                                       request_id_prefix)
         return sampled_requests
@@ -1141,12 +1184,13 @@
         num_requests: int,
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list:
         output_len = (output_len
                       if output_len is not None else self.DEFAULT_OUTPUT_LEN)
         sampled_requests = []
-        for item in self.data:
+        for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
                 break
             parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
@@ -1168,8 +1212,10 @@
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
+                    request_id=request_id_prefix + str(i),
                 ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
+        self.maybe_oversample_requests(sampled_requests, num_requests,
+                                       request_id_prefix)
         return sampled_requests
@@ -1198,11 +1244,12 @@
         num_requests: int,
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
         **kwargs) -> list:
         output_len = (output_len
                       if output_len is not None else self.DEFAULT_OUTPUT_LEN)
         sampled_requests = []
-        for item in self.data:
+        for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
                 break
             prompt = f"{item['input']}\n\n{item['instruction']} Just output \
@@ -1224,8 +1271,10 @@
                     prompt=prompt,
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
+                    request_id=request_id_prefix + str(i),
                 ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
+        self.maybe_oversample_requests(sampled_requests, num_requests,
+                                       request_id_prefix)
         return sampled_requests
@@ -1255,13 +1304,14 @@
         num_requests: int,
         output_len: Optional[int] = None,
         enable_multimodal_chat: bool = False,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list:
         output_len = (output_len
                       if output_len is not None else self.DEFAULT_OUTPUT_LEN)
 
         sampled_requests = []
-        for item in self.data:
+        for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
                 break
             prompt = item["turns"][0]
@@ -1282,8 +1332,10 @@
                     prompt=prompt,
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
+                    request_id=request_id_prefix + str(i),
                 ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
+        self.maybe_oversample_requests(sampled_requests, num_requests,
+                                       request_id_prefix)
         return sampled_requests
@@ -1305,8 +1357,10 @@
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
         output_len: Optional[int] = None,
+        request_id_prefix: str = "",
         **kwargs) -> list:
         sampled_requests = []
+        ind = 0
         dynamic_output = output_len is None
 
         for item in self.data:
@@ -1331,8 +1385,12 @@
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     multi_modal_data=None,
+                    request_id=request_id_prefix + str(ind),
+
                 ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
+            ind += 1
+        self.maybe_oversample_requests(sampled_requests, num_requests,
+                                       request_id_prefix)
         return sampled_requests
@@ -1403,13 +1461,14 @@
     }
 
     def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
+               request_id_prefix: str = "",
               **kwargs):
         formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(
             self.dataset_path)
         if formatting_prompt_func is None:
             raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
         samples = []
-        for sample in self.data:
+        for i, sample in enumerate(self.data):
             sample = formatting_prompt_func(sample)
             samples.append(
                 SampleRequest(
@@ -1417,10 +1476,11 @@
                     prompt_len=len(tokenizer(sample["prompt"]).input_ids),
                     expected_output_len=len(
                         tokenizer(sample["expected_output"]).input_ids),
+                    request_id=request_id_prefix + str(i),
                 ))
             if len(samples) >= num_requests:
                 break
-        self.maybe_oversample_requests(samples, num_requests)
+        self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
         return samples
@@ -1470,6 +1530,7 @@
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
         output_len: Optional[int] = None,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list:
         output_len = (output_len
@@ -1477,6 +1538,7 @@
         prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
         prompt_len = len(tokenizer(prompt).input_ids)
         sampled_requests = []
+        ind = 0
         skipped = 0
         for item in self.data:
             if len(sampled_requests) >= num_requests:
@@ -1496,7 +1558,9 @@
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
+                    request_id=request_id_prefix + str(ind),
                 ))
+            ind += 1
         if skipped:
             logger.warning(
                 "%d samples discarded from dataset due to"
@@ -1504,7 +1568,8 @@
                 " what Whisper supports.",
                 skipped,
             )
-        self.maybe_oversample_requests(sampled_requests, num_requests)
+        self.maybe_oversample_requests(sampled_requests, num_requests,
+                                       request_id_prefix)
         return sampled_requests
@@ -1541,11 +1606,13 @@
         tokenizer: PreTrainedTokenizerBase,
         num_requests: int,
         output_len: Optional[int] = None,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list[SampleRequest]:
         # Force dynamic output length based on reference completion.
         dynamic_output = output_len is None
         sampled_requests: list[SampleRequest] = []
+        ind = 0
 
         for item in self.data:
             if len(sampled_requests) >= num_requests:
@@ -1580,10 +1647,13 @@
                     prompt=prompt_formatted,
                     prompt_len=prompt_len,
                     expected_output_len=expected_output_len,
+                    request_id=request_id_prefix + str(ind),
                 )
             )
+            ind += 1
 
-        self.maybe_oversample_requests(sampled_requests, num_requests)
+        self.maybe_oversample_requests(sampled_requests, num_requests,
+                                       request_id_prefix)
         return sampled_requests
@@ -1616,6 +1686,7 @@
         suffix_len: int = DEFAULT_SUFFIX_LEN,
         num_prefixes: int = DEFAULT_NUM_PREFIXES,
         output_len: int = DEFAULT_OUTPUT_LEN,
+        request_id_prefix: str = "",
         **kwargs,
     ) -> list[SampleRequest]:
         vocab_size = tokenizer.vocab_size
diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py
index 47bc288774504..677fe16cf5ccd 100644
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -31,6 +31,7 @@ class RequestFuncInput:
     multi_modal_content: Optional[dict | list[dict]] = None
     ignore_eos: bool = False
     language: Optional[str] = None
+    request_id: Optional[str] = None
 
 
 @dataclass
@@ -87,6 +88,8 @@ async def async_request_openai_completions(
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
     }
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -210,6 +213,8 @@ async def async_request_openai_chat_completions(
         "Content-Type": "application/json",
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
     }
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -311,6 +316,8 @@ async def async_request_openai_audio(
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
     }
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id
 
     # Send audio file
     def to_bytes(y, sr):
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 7bf04c7532411..79f2c475cbe5d 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -478,11 +478,12 @@ async def benchmark(
                     "timestamp": timestamp
                 })
             last_int_rps = current_int_rps
-        prompt, prompt_len, output_len, mm_content = (
+        prompt, prompt_len, output_len, mm_content, request_id = (
             request.prompt,
             request.prompt_len,
             request.expected_output_len,
             request.multi_modal_data,
+            request.request_id,
         )
         req_model_id, req_model_name = model_id, model_name
         if lora_modules:
@@ -498,7 +499,8 @@
             logprobs=logprobs,
             multi_modal_content=mm_content,
             ignore_eos=ignore_eos,
-            extra_body=extra_body)
+            extra_body=extra_body,
+            request_id=request_id,)
         tasks.append(
             asyncio.create_task(
                 limited_request_func(request_func_input=request_func_input,
@@ -865,6 +867,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
         "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
     )
+    parser.add_argument(
+        "--request-id-prefix",
+        type=str,
+        required=False,
+        default="benchmark-serving",
+        help="Specify the prefix of the request id.",
+    )
+
     sampling_group = parser.add_argument_group("sampling parameters")
     sampling_group.add_argument(
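
--
Editor's note, not part of the patch above: a minimal sketch of the request-id scheme this change introduces. Each sampled request is tagged "<prefix><index>" (so with the default --request-id-prefix of "benchmark-serving" the ids are benchmark-serving0, benchmark-serving1, ...), deep-copied oversampled extras continue the numbering past the original sample count, and the id reaches the server as an "x-request-id" HTTP header, which lets server-side logs be joined back to individual benchmark samples. The helper names below (tag_requests, send_one) and the completion payload are illustrative assumptions, not code from the patch; aiohttp is assumed since the benchmark clients already use it.

    import random
    from copy import deepcopy

    import aiohttp


    def tag_requests(prompts, prefix="benchmark-serving", num_requests=None):
        # Mirrors sample(): each sampled request gets "<prefix><index>".
        tagged = [
            {"prompt": p, "request_id": prefix + str(i)}
            for i, p in enumerate(prompts)
        ]
        # Mirrors maybe_oversample_requests(): deep-copied extras get fresh,
        # continuing indices instead of sharing the originals' ids.
        if num_requests is not None and len(tagged) < num_requests:
            extra = deepcopy(random.choices(tagged, k=num_requests - len(tagged)))
            for i, req in enumerate(extra):
                req["request_id"] = prefix + str(len(tagged) + i)
            tagged.extend(extra)
        return tagged


    async def send_one(session: aiohttp.ClientSession, api_url: str, req: dict):
        # Mirrors the request functions: the id travels as the x-request-id
        # header alongside a normal completion payload.
        headers = {"x-request-id": req["request_id"]}
        payload = {"prompt": req["prompt"], "max_tokens": 16}
        async with session.post(api_url, json=payload, headers=headers) as resp:
            return await resp.text()

Running the benchmark with, for example, --request-id-prefix my-run- should therefore produce ids my-run-0, my-run-1, and so on, one per sampled request.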