From 9e6562a3f625279fd7c8b9ac53c30fed3b01f5b9 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 9 Dec 2025 09:59:54 -0800
Subject: [PATCH] [Model Runner V2] Fix Triton warning on tl.where (#30355)

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu/sample/penalties.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/v1/worker/gpu/sample/penalties.py b/vllm/v1/worker/gpu/sample/penalties.py
index c8d4b7d81841d..b4fcc822ecfce 100644
--- a/vllm/v1/worker/gpu/sample/penalties.py
+++ b/vllm/v1/worker/gpu/sample/penalties.py
@@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel(
                 mask=packed_block < tl.cdiv(vocab_size, 32),
             )
             prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1
+            prompt_bin_mask = prompt_bin_mask.to(tl.int1)
             prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE)
 
             # If token appears in prompt or output, apply, otherwise use 1.0 for no-op.