[BugFix] add max-num-batched-token to scheduler hash (#29829)

Signed-off-by: Boyuan Feng <boyuan@meta.com>

parent 48d15a32aa
commit 70fb77b4dc
@@ -175,9 +175,19 @@ class SchedulerConfig:
         excluding anything before input ids/embeddings and after
         the final hidden states.
         """
-        # no factors to consider.
-        # this config will not affect the computation graph.
         factors: list[Any] = []
+
+        # max_num_batched_tokens needs to be included in the hash for
+        # two reasons:
+        # 1. LoRA creates static buffers based on max_num_batched_tokens.
+        #    The tensor sizes and strides get captured in the torch.compile
+        #    graph explicitly.
+        # 2. Inductor decides whether to use 32-bit or 64-bit indexing
+        #    integers based on the data sizes; `max_num_batched_tokens` has
+        #    an impact on that. For more details, please check
+        #    https://github.com/vllm-project/vllm/issues/29585
+        factors.append(self.max_num_batched_tokens)
+
         hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
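For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of the factors-based hashing this hunk extends. Plain hashlib.sha256 stands in for vLLM's safe_hash wrapper, and TinySchedulerConfig is a hypothetical stand-in, not the real SchedulerConfig:

import hashlib
from typing import Any


class TinySchedulerConfig:
    """Hypothetical stand-in for SchedulerConfig's hashing logic."""

    def __init__(self, max_num_batched_tokens: int) -> None:
        self.max_num_batched_tokens = max_num_batched_tokens

    def compute_hash(self) -> str:
        # Every value that can change the compiled graph must be a factor;
        # otherwise two incompatible graphs can share one compile-cache entry.
        factors: list[Any] = [self.max_num_batched_tokens]
        return hashlib.sha256(str(factors).encode()).hexdigest()


# Configs that differ only in max_num_batched_tokens now hash differently,
# so a stale torch.compile artifact is not reused across them.
assert TinySchedulerConfig(2048).compute_hash() != TinySchedulerConfig(8192).compute_hash()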
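Reason 2 can be made concrete with a toy calculation. The 2**31 - 1 bound is the general int32 limit; the buffer shape below is a made-up example, not vLLM's actual allocation:

# A compiler can index a buffer with int32 only while the element count
# fits in int32 range, so the maximum batch size feeds codegen decisions.
INT32_MAX = 2**31 - 1


def index_width(max_num_batched_tokens: int, hidden_size: int = 8192) -> int:
    numel = max_num_batched_tokens * hidden_size  # hypothetical activation buffer
    return 32 if numel <= INT32_MAX else 64


print(index_width(8_192))      # 67,108,864 elements -> 32-bit indexing suffices
print(index_width(1_000_000))  # 8,192,000,000 elements -> needs 64-bit indexing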
@@ -265,10 +265,6 @@ class VllmConfig:
             vllm_factors.append("None")
         if self.lora_config:
             vllm_factors.append(self.lora_config.compute_hash())
-            # LoRA creates static buffers based on max_num_batched_tokens.
-            # The tensor sizes and strides get captured in the torch.compile
-            # graph explicitly.
-            vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens))
         else:
             vllm_factors.append("None")
         if self.speculative_config:
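The removal above is safe because max_num_batched_tokens now reaches the top-level key through SchedulerConfig.compute_hash(); keeping the LoRA-branch append would double-count it, and (per the issue linked in the first hunk) gating it on LoRA missed the non-LoRA Inductor case. A minimal sketch of the composition, assuming VllmConfig re-hashes its sub-config hashes (names here are illustrative):

import hashlib


def compose_hash(sub_hashes: list[str]) -> str:
    # Top-level cache key: concatenate the sub-config hashes positionally
    # and hash the result, so any sub-config change changes the key.
    return hashlib.sha256("|".join(sub_hashes).encode()).hexdigest()


scheduler_hash = hashlib.sha256(str([8192]).encode()).hexdigest()  # factor list as in hunk 1
lora_hash = "None"  # stand-in for "no LoRA configured"
print(compose_hash([scheduler_hash, lora_hash]))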