From c2ed069b32e2805c05a858c6157f4c6393b145a8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 1 Nov 2025 10:51:24 -0700 Subject: [PATCH] [BugFix] Fix mixed penalties batch with async scheduling (#27910) Signed-off-by: Nick Hill --- vllm/v1/sample/ops/penalties.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py index 898b90d41abae..241d9de957ea2 100644 --- a/vllm/v1/sample/ops/penalties.py +++ b/vllm/v1/sample/ops/penalties.py @@ -21,6 +21,14 @@ def apply_all_penalties( """ _, vocab_size = logits.shape output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size, logits.device) + + # In the async scheduling case, rows that won't have penalties applied may contain + # -1 placeholder token ids. We must replace these with valid token ids so that the + # scatter done in apply_penalties is valid. + # NOTE(nick): The penalties implementation is currently quite inefficient and + # will be reworked anyhow. + output_tokens_t.masked_fill_(output_tokens_t == -1, vocab_size) + return apply_penalties( logits, prompt_token_ids,