From ec8a5e53863826eb25a9b242535d4671bbb3f6ce Mon Sep 17 00:00:00 2001
From: Brayden Zhong
Date: Wed, 26 Feb 2025 06:06:47 -0500
Subject: [PATCH] [Misc]: Add support for goodput on guided benchmarking + TPOT
 calculation refactor (#13736)

Signed-off-by: Brayden Zhong
---
 benchmarks/benchmark_serving_guided.py | 87 ++++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 5 deletions(-)

diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py
index 04942b06ffd5d..05eadff797876 100644
--- a/benchmarks/benchmark_serving_guided.py
+++ b/benchmarks/benchmark_serving_guided.py
@@ -9,7 +9,7 @@
 On the server side, run one of the following commands:
     ./launch_tgi_server.sh
 On the client side, run:
-    python benchmarks/benchmark_serving.py \
+    python benchmarks/benchmark_serving_guided.py \
         --backend <backend> \
         --model <your_model> \
         --dataset json \
@@ -31,7 +31,7 @@ import random
 import time
 import warnings
 from dataclasses import dataclass
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import AsyncGenerator, Dict, List, Optional, Tuple
 
 import datasets
 import numpy as np
@@ -264,6 +264,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: List[str],
     selected_percentiles: List[float],
+    goodput_config_dict: Optional[Dict[str, float]] = None,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     actual_output_lens: List[int] = []
     total_input = 0
@@ -287,10 +288,10 @@
             total_input += input_requests[i].prompt_len
             tpot = 0
             if output_len > 1:
-                tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
-                                                                 1)
+                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
+                tpot = latency_minus_ttft / (output_len - 1)
             tpots.append(tpot)
-            outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
+            outputs[i].tpot = tpot
+            # Note: if output_len <= 1, we regard tpot as 0 for goodput
             all_tpots.append(tpot)
             itls += outputs[i].itl
@@ -300,6 +301,28 @@
         else:
             actual_output_lens.append(0)
 
+    if goodput_config_dict:
+        valid_metrics = []
+        slo_values = []
+
+        if "ttft" in goodput_config_dict:
+            valid_metrics.append(ttfts)
+            slo_values.append(goodput_config_dict["ttft"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "tpot" in goodput_config_dict:
+            valid_metrics.append(all_tpots)
+            slo_values.append(goodput_config_dict["tpot"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "e2el" in goodput_config_dict:
+            valid_metrics.append(e2els)
+            slo_values.append(goodput_config_dict["e2el"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+
+        for req_metric in zip(*valid_metrics):
+            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
+            if is_good_req:
+                good_completed += 1
+
     if completed == 0:
         warnings.warn(
             "All requests failed. This is likely due to a misconfiguration "
@@ -356,6 +379,7 @@ async def benchmark(
     max_concurrency: Optional[int],
     guided_decoding_ratio: float,
     guided_decoding_backend: str,
+    goodput_config_dict: Optional[Dict[str, float]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -483,6 +507,7 @@
         tokenizer=tokenizer,
         selected_percentile_metrics=selected_percentile_metrics,
         selected_percentiles=selected_percentiles,
+        goodput_config_dict=goodput_config_dict,
     )
 
     print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -494,6 +519,9 @@
                                  metrics.total_output))
     print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                     metrics.request_throughput))
+    if goodput_config_dict:
+        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
+                                        metrics.request_goodput))
     print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
                                     metrics.output_throughput))
     print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
@@ -617,6 +645,40 @@ def evaluate(ret, args):
                 100) if len(not_none_scores) > 0 else None
 
 
+def parse_goodput(slo_pairs):
+    goodput_config_dict = {}
+    try:
+        for slo_pair in slo_pairs:
+            slo_name, slo_val = slo_pair.split(":")
+            goodput_config_dict[slo_name] = float(slo_val)
+    except ValueError as err:
+        raise argparse.ArgumentTypeError(
+            "Invalid format found for service level objectives. "
+            "Specify service level objectives for goodput as \"KEY:VALUE\" "
+            "pairs, where the key is a metric name, and the value is a "
+            "number in milliseconds.") from err
+    return goodput_config_dict
+
+
+def check_goodput_args(args):
+    goodput_config_dict = {}
+    VALID_NAMES = ["ttft", "tpot", "e2el"]
+    if args.goodput:
+        goodput_config_dict = parse_goodput(args.goodput)
+        for slo_name, slo_val in goodput_config_dict.items():
+            if slo_name not in VALID_NAMES:
+                raise ValueError(
+                    f"Invalid metric name found, {slo_name}: {slo_val}. "
+                    "The service level objective name should be one of "
+                    f"{str(VALID_NAMES)}. ")
+            if slo_val < 0:
+                raise ValueError(
+                    f"Invalid value found, {slo_name}: {slo_val}. "
+                    "The service level objective value should be "
+                    "non-negative.")
+    return goodput_config_dict
+
+
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -661,6 +723,8 @@
 
     input_requests = sample_requests(tokenizer, args)
 
+    goodput_config_dict = check_goodput_args(args)
+
     benchmark_result, ret = asyncio.run(
         benchmark(
             backend=backend,
@@ -681,6 +745,7 @@
             max_concurrency=args.max_concurrency,
             guided_decoding_ratio=args.guided_decoding_ratio,
             guided_decoding_backend=args.guided_decoding_backend,
+            goodput_config_dict=goodput_config_dict,
         ))
 
     # Save config and results to json
@@ -865,6 +930,18 @@
         "Default value is \"99\". "
         "Use \"--percentile-metrics\" to select metrics.",
     )
+    parser.add_argument(
+        "--goodput",
+        nargs="+",
+        required=False,
+        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+        "pairs, where the key is a metric name, and the value is in "
+        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+        "separated by spaces. Allowed request level metric names are "
+        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+
     parser.add_argument("--no-guided-decoding",
                         action='store_true',
                         default=False,
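
Note: with this patch applied, goodput SLOs would be passed as space-separated
"KEY:VALUE" pairs with values in milliseconds, for example:

    python benchmarks/benchmark_serving_guided.py --backend <backend> \
        --model <your_model> --goodput ttft:300 tpot:50

For readers who want the new goodput logic outside the diff, here is a
minimal, self-contained Python sketch of the check that calculate_metrics()
now performs. The helper name count_good_requests and the sample metric
values are hypothetical, and MILLISECONDS_TO_SECONDS_CONVERSION is assumed
to be 1000 as defined elsewhere in the file.

# Illustrative sketch of the goodput check added to calculate_metrics().
# The helper name and sample values below are hypothetical; the real code
# operates on the ttfts/all_tpots/e2els lists built inside that function.
from typing import Dict, List

MILLISECONDS_TO_SECONDS_CONVERSION = 1000  # assumed value, per the file


def count_good_requests(
        ttfts: List[float],      # per-request time to first token, seconds
        all_tpots: List[float],  # per-request time per output token, seconds
        e2els: List[float],      # per-request end-to-end latency, seconds
        goodput_config_dict: Dict[str, float],  # SLOs in milliseconds
) -> int:
    valid_metrics = []
    slo_values = []
    # Track only the metrics the user configured, converting ms -> s.
    for name, metric in (("ttft", ttfts), ("tpot", all_tpots),
                         ("e2el", e2els)):
        if name in goodput_config_dict:
            valid_metrics.append(metric)
            slo_values.append(goodput_config_dict[name] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
    good_completed = 0
    for req_metric in zip(*valid_metrics):
        # The request is "good" only if every tracked metric meets its SLO.
        if all(slo >= observed
               for slo, observed in zip(slo_values, req_metric)):
            good_completed += 1
    return good_completed


# "--goodput ttft:300 tpot:50" is parsed by parse_goodput() into
# {"ttft": 300.0, "tpot": 50.0}. Only the first request below meets both
# SLOs (0.25 s <= 0.3 s and 0.04 s <= 0.05 s), so this prints 1.
print(count_good_requests(
    ttfts=[0.25, 0.40],
    all_tpots=[0.04, 0.03],
    e2els=[1.2, 2.0],  # ignored here: no "e2el" SLO was configured
    goodput_config_dict={"ttft": 300.0, "tpot": 50.0},
))

A request only counts toward goodput when every configured SLO holds at once,
which is why the patch zips the per-request metrics together rather than
checking each metric list independently.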