diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py
index 88f3e62fbd4ed..1e089b42cccde 100644
--- a/vllm/config/scheduler.py
+++ b/vllm/config/scheduler.py
@@ -175,9 +175,19 @@ class SchedulerConfig:
         excluding anything before input ids/embeddings and after
         the final hidden states.
         """
-        # no factors to consider.
-        # this config will not affect the computation graph.
         factors: list[Any] = []
+
+        # max_num_batched_tokens needs to be included in the hash for
+        # two reasons:
+        # 1. LoRA creates static buffers based on max_num_batched_tokens.
+        #    The tensor sizes and strides get captured in the torch.compile
+        #    graph explicitly.
+        # 2. Inductor decides whether to use 32-bit or 64-bit indexing
+        #    integers based on the data sizes; `max_num_batched_tokens`
+        #    has an impact on that. For more details, please check
+        #    https://github.com/vllm-project/vllm/issues/29585
+        factors.append(self.max_num_batched_tokens)
+
         hash_str = safe_hash(str(factors).encode(),
                              usedforsecurity=False).hexdigest()
         return hash_str
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index 4542866aa166c..615b1f8489eff 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -265,10 +265,6 @@ class VllmConfig:
             vllm_factors.append("None")
         if self.lora_config:
             vllm_factors.append(self.lora_config.compute_hash())
-            # LoRA creates static buffers based on max_num_batched_tokens.
-            # The tensor sizes and strides get captured in the torch.compile
-            # graph explicitly.
-            vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens))
         else:
             vllm_factors.append("None")
         if self.speculative_config:
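
To make the effect of the change concrete, below is a minimal, self-contained sketch of the hashing pattern the compile cache relies on. It is an illustration only, not vLLM's actual code: `MiniSchedulerConfig` is a hypothetical stand-in for `SchedulerConfig`, and `hashlib.md5` stands in for the `safe_hash` helper used in the diff. The point is simply that two configs differing only in `max_num_batched_tokens` now produce different cache keys, so a compiled artifact built for one batch-token budget is not reused for another.

```python
# Simplified sketch of the compute_hash pattern after this change.
# NOTE: illustration only, not vLLM's implementation; MiniSchedulerConfig
# is a hypothetical stand-in for SchedulerConfig, and hashlib.md5 stands
# in for vLLM's safe_hash helper.
import hashlib
from dataclasses import dataclass
from typing import Any


@dataclass
class MiniSchedulerConfig:
    max_num_batched_tokens: int = 8192

    def compute_hash(self) -> str:
        factors: list[Any] = []
        # max_num_batched_tokens affects the compiled graph (LoRA buffer
        # sizes, Inductor's 32-/64-bit indexing choice), so it is hashed.
        factors.append(self.max_num_batched_tokens)
        return hashlib.md5(
            str(factors).encode(), usedforsecurity=False
        ).hexdigest()


if __name__ == "__main__":
    a = MiniSchedulerConfig(max_num_batched_tokens=8192)
    b = MiniSchedulerConfig(max_num_batched_tokens=16384)
    # Different max_num_batched_tokens now yield different cache keys,
    # so a stale torch.compile artifact is not reused between the two.
    assert a.compute_hash() != b.compute_hash()
    print(a.compute_hash(), b.compute_hash())
```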