Mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Frontend] Add --logprobs argument to benchmark_serving.py (#8191)
commit e5cab71531 (parent baa5467547)

This change threads an optional logprobs value from the benchmark command line, through RequestFuncInput, and into the OpenAI-style completions request payload; a test docstring is updated to match the new semantics.
@@ -24,6 +24,7 @@ class RequestFuncInput:
     model: str
     best_of: int = 1
     use_beam_search: bool = False
+    logprobs: Optional[int] = None


 @dataclass
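For orientation, a minimal sketch of constructing the extended dataclass; the values are illustrative, and the fields not visible in this hunk (prompt, api_url, prompt_len, output_len) are assumed from the surrounding file:

    # Hypothetical construction of RequestFuncInput with the new field.
    request_func_input = RequestFuncInput(
        prompt="Hello, world",                           # assumed field
        api_url="http://localhost:8000/v1/completions",  # assumed field
        prompt_len=4,                                    # assumed field
        output_len=128,                                  # assumed field
        model="my-model",
        logprobs=5,  # new: request 5 logprobs per token; None keeps the prior behavior
    )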
@@ -236,6 +237,7 @@ async def async_request_openai_completions(
         "temperature": 0.0,
         "best_of": request_func_input.best_of,
         "max_tokens": request_func_input.output_len,
+        "logprobs": request_func_input.logprobs,
         "stream": True,
     }
     headers = {
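The new key forwards the per-request value straight into the OpenAI-style completions payload. A minimal sketch of the resulting request body when logprobs=5, with illustrative values for the other fields (the "prompt" key is assumed from the surrounding code):

    payload = {
        "model": "my-model",
        "prompt": "Hello, world",  # assumed to accompany the fields shown above
        "temperature": 0.0,
        "best_of": 1,
        "max_tokens": 128,
        "logprobs": 5,             # None when --logprobs is not supplied
        "stream": True,
    }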
@@ -318,6 +318,7 @@ async def benchmark(
     model_id: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
+    logprobs: Optional[int],
     best_of: int,
     use_beam_search: bool,
     request_rate: float,
@@ -339,6 +340,7 @@ async def benchmark(
         api_url=api_url,
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        logprobs=logprobs,
         best_of=best_of,
         use_beam_search=use_beam_search,
     )
@@ -358,6 +360,7 @@ async def benchmark(
             api_url=base_url + "/start_profile",
             prompt_len=test_prompt_len,
             output_len=test_output_len,
+            logprobs=logprobs,
             best_of=best_of,
             use_beam_search=use_beam_search,
         )
@@ -379,6 +382,7 @@ async def benchmark(
             api_url=api_url,
             prompt_len=prompt_len,
             output_len=output_len,
+            logprobs=logprobs,
             best_of=best_of,
             use_beam_search=use_beam_search,
         )
@@ -396,6 +400,7 @@ async def benchmark(
             api_url=base_url + "/stop_profile",
             prompt_len=test_prompt_len,
             output_len=test_output_len,
+            logprobs=logprobs,
             best_of=best_of,
             use_beam_search=use_beam_search,
         )
@@ -580,6 +585,7 @@ def main(args: argparse.Namespace):
             model_id=model_id,
             tokenizer=tokenizer,
             input_requests=input_requests,
+            logprobs=args.logprobs,
             best_of=args.best_of,
             use_beam_search=args.use_beam_search,
             request_rate=args.request_rate,
@@ -721,6 +727,16 @@ if __name__ == "__main__":
         help=
         "Number of output tokens per request, used only for sonnet dataset.",
     )
+    parser.add_argument(
+        "--logprobs",
+        type=int,
+        default=None,
+        help=("Number of logprobs-per-token to compute & return as part of "
+              "the request. If unspecified, then either (1) if beam search "
+              "is disabled, no logprobs are computed & a single dummy "
+              "logprob is returned for each token; or (2) if beam search "
+              "is enabled 1 logprob per token is computed"),
+    )
     parser.add_argument(
         "--sonnet-prefix-len",
         type=int,
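A standalone sketch of the new flag's parsing behavior, mirroring the add_argument call above (not the script itself):

    import argparse

    parser = argparse.ArgumentParser()
    # Same spec as the diff: an optional integer that defaults to None.
    parser.add_argument("--logprobs", type=int, default=None)

    print(parser.parse_args([]).logprobs)                   # None -> default behavior
    print(parser.parse_args(["--logprobs", "5"]).logprobs)  # 5

In a real run this would look something like `python benchmarks/benchmark_serving.py ... --logprobs 5`, with the remaining flags taken from the script's existing interface.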
@@ -57,7 +57,7 @@ def test_multi_step_llm(
                           GPU -> CPU output transfer
       num_prompts: number of example prompts under test
       num_logprobs: corresponds to the `logprobs` argument to the OpenAI
-                    completions endpoint; `None` -> no logprobs
+                    completions endpoint; `None` -> 1 logprob returned.
     """

     prompts = example_prompts