From bd7599d34ae9aa46994f8e233e2293f6f2d13019 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 3 Apr 2025 03:36:01 +0200 Subject: [PATCH] [V1][TPU] Do not compile sampling more than needed (#15883) Signed-off-by: NickLucche --- vllm/v1/worker/tpu_model_runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index c2edbaf351d04..b1d5c0f338541 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -862,7 +862,9 @@ class TPUModelRunner: out = self.model.sample_from_hidden(dummy_hidden, sampling_meta) out = out.cpu() - if num_reqs_to_sample >= self.max_num_reqs: + # Requests can't be more than tokens. But do compile for the + # next bigger value in case num_tokens uses bucketed padding. + if num_reqs_to_sample >= min(num_tokens, self.max_num_reqs): break # Make sure to compile the `max_num_reqs` upper-limit case num_reqs_to_sample = _get_padded_num_reqs_with_upper_limit(