From 70fb77b4dcc3cfb368831d1aba8e1e2dca7c31a9 Mon Sep 17 00:00:00 2001
From: Boyuan Feng
Date: Tue, 2 Dec 2025 00:55:02 -0800
Subject: [PATCH] [BugFix] add max-num-batched-token to scheduler hash (#29829)

Signed-off-by: Boyuan Feng
---
 vllm/config/scheduler.py | 14 ++++++++++++--
 vllm/config/vllm.py      |  4 ----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 88f3e62fbd4e..1e089b42cccd 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -175,9 +175,19 @@ class SchedulerConfig:
         excluding anything before input ids/embeddings and after the final
         hidden states.
         """
-        # no factors to consider.
-        # this config will not affect the computation graph.
         factors: list[Any] = []
+
+        # max_num_batched_tokens needs to be included in the hash for
+        # two reasons:
+        # 1. LoRA creates static buffers based on max_num_batched_tokens.
+        #    The tensor sizes and strides get captured in the torch.compile
+        #    graph explicitly.
+        # 2. Inductor decides whether to use 32-bit or 64-bit indexing
+        #    integers based on the data sizes, and `max_num_batched_tokens`
+        #    has an impact on that. For more details, see
+        #    https://github.com/vllm-project/vllm/issues/29585
+        factors.append(self.max_num_batched_tokens)
+
         hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str

diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 4542866aa166..615b1f8489ef 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -265,10 +265,6 @@ class VllmConfig:
             vllm_factors.append("None")
         if self.lora_config:
             vllm_factors.append(self.lora_config.compute_hash())
-            # LoRA creates static buffers based on max_num_batched_tokens.
-            # The tensor sizes and strides get captured in the torch.compile
-            # graph explicitly.
-            vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens))
         else:
             vllm_factors.append("None")
         if self.speculative_config:
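
For context, the sketch below illustrates the behavior the patch relies on: once max_num_batched_tokens is part of the factors list, two scheduler configs that differ only in that limit hash to different keys, so a torch.compile artifact cached for one limit is not reused for the other. MiniSchedulerConfig and the use of hashlib.sha256 are illustrative stand-ins, not vLLM's actual SchedulerConfig or its safe_hash helper.

    # Minimal sketch; hashlib.sha256 stands in for vLLM's internal hash
    # helper, and MiniSchedulerConfig is a stripped-down stand-in for
    # SchedulerConfig.
    import hashlib
    from dataclasses import dataclass
    from typing import Any


    @dataclass
    class MiniSchedulerConfig:
        max_num_batched_tokens: int

        def compute_hash(self) -> str:
            factors: list[Any] = []
            # Including max_num_batched_tokens means configs that differ only
            # in this value no longer share a compile-cache entry.
            factors.append(self.max_num_batched_tokens)
            return hashlib.sha256(str(factors).encode()).hexdigest()


    # Different token limits now produce different hashes.
    assert (
        MiniSchedulerConfig(2048).compute_hash()
        != MiniSchedulerConfig(8192).compute_hash()
    )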