From 70fb77b4dcc3cfb368831d1aba8e1e2dca7c31a9 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Tue, 2 Dec 2025 00:55:02 -0800
Subject: [PATCH] [BugFix] add max-num-batched-token to scheduler hash (#29829)

Signed-off-by: Boyuan Feng
---
 vllm/config/scheduler.py | 14 ++++++++++++--
 vllm/config/vllm.py      |  4 ----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 88f3e62fbd4e..1e089b42cccd 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -175,9 +175,19 @@ class SchedulerConfig:
         excluding anything before input ids/embeddings and after the final
         hidden states.
         """
-        # no factors to consider.
-        # this config will not affect the computation graph.
         factors: list[Any] = []
+
+        # max_num_batched_tokens needs to be included in the hash for
+        # two reasons:
+        # 1. LoRA creates static buffers based on max_num_batched_tokens.
+        #    The tensor sizes and strides get captured in the torch.compile
+        #    graph explicitly.
+        # 2. Inductor decides whether to use 32-bit or 64-bit indexing
+        #    integers based on the data sizes, and `max_num_batched_tokens`
+        #    has an impact on that. For more details, see
+        #    https://github.com/vllm-project/vllm/issues/29585
+        factors.append(self.max_num_batched_tokens)
+
         hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 4542866aa166..615b1f8489ef 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -265,10 +265,6 @@ class VllmConfig:
             vllm_factors.append("None")
         if self.lora_config:
             vllm_factors.append(self.lora_config.compute_hash())
-            # LoRA creates static buffers based on max_num_batched_tokens.
-            # The tensor sizes and strides get captured in the torch.compile
-            # graph explicitly.
-            vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens))
         else:
             vllm_factors.append("None")
         if self.speculative_config:
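
For context, the sketch below illustrates the behavior the patch relies on: once max_num_batched_tokens is part of the factors list, two scheduler configs that differ only in that limit hash to different keys, so a torch.compile artifact cached for one limit is not reused for the other. MiniSchedulerConfig and the use of hashlib.sha256 are illustrative stand-ins, not vLLM's actual SchedulerConfig or its safe_hash helper.

    # Minimal sketch; hashlib.sha256 stands in for vLLM's internal hash
    # helper, and MiniSchedulerConfig is a stripped-down stand-in for
    # SchedulerConfig.
    import hashlib
    from dataclasses import dataclass
    from typing import Any


    @dataclass
    class MiniSchedulerConfig:
        max_num_batched_tokens: int

        def compute_hash(self) -> str:
            factors: list[Any] = []
            # Including max_num_batched_tokens means configs that differ only
            # in this value no longer share a compile-cache entry.
            factors.append(self.max_num_batched_tokens)
            return hashlib.sha256(str(factors).encode()).hexdigest()


    # Different token limits now produce different hashes.
    assert (
        MiniSchedulerConfig(2048).compute_hash()
        != MiniSchedulerConfig(8192).compute_hash()
    )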