[TPU][V1][Bugfix] Fix chunked prefill with padding (#15037)

Signed-off-by: NickLucche <nlucches@redhat.com>
2025-12-16 04:45:01 +08:00 · 2025-03-18 15:34:45 +01:00 · 2025-03-18 15:34:45 +01:00 · af35d3a3cc
commit af35d3a3cc
parent 3b457143d2
1 changed file with 3 additions and 0 deletions
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -410,6 +410,9 @@ class TPUModelRunner:
        # Do the padding and copy the tensors to the TPU.
        padded_total_num_scheduled_tokens = _get_padded_token_len(
            total_num_scheduled_tokens)
        # Zero padding to clear stale ids from prev chunked-prefill iteration
        self.input_ids_cpu[
            total_num_scheduled_tokens:padded_total_num_scheduled_tokens] = 0
        self.input_ids = self.input_ids_cpu[:
                                            padded_total_num_scheduled_tokens].to(
                                                self.device)