[BugFix] add max-num-batched-token to scheduler hash (#29829)

Signed-off-by: Boyuan Feng <boyuan@meta.com>

parent 48d15a32aa
commit 70fb77b4dc
@@ -175,9 +175,19 @@ class SchedulerConfig:
         excluding anything before input ids/embeddings and after
         the final hidden states.
         """
-        # no factors to consider.
-        # this config will not affect the computation graph.
         factors: list[Any] = []
+
+        # max_num_batched_tokens needs to be included in the hash for
+        # two reasons:
+        # 1. LoRA creates static buffers based on max_num_batched_tokens.
+        #    The tensor sizes and strides get captured in the torch.compile
+        #    graph explicitly.
+        # 2. Inductor decides whether to use 32-bit or 64-bit indexing
+        #    integers based on the data sizes; `max_num_batched_tokens` has
+        #    an impact on that. For more details, please check
+        #    https://github.com/vllm-project/vllm/issues/29585
+        factors.append(self.max_num_batched_tokens)
+
         hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str
 
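For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of the factors-based hashing this hunk extends. Plain hashlib.sha256 stands in for vLLM's safe_hash wrapper, and TinySchedulerConfig is a hypothetical stand-in, not the real SchedulerConfig:

import hashlib
from typing import Any


class TinySchedulerConfig:
    """Hypothetical stand-in for SchedulerConfig's hashing logic."""

    def __init__(self, max_num_batched_tokens: int) -> None:
        self.max_num_batched_tokens = max_num_batched_tokens

    def compute_hash(self) -> str:
        # Every value that can change the compiled graph must be a factor;
        # otherwise two incompatible graphs can share one compile-cache entry.
        factors: list[Any] = [self.max_num_batched_tokens]
        return hashlib.sha256(str(factors).encode()).hexdigest()


# Configs that differ only in max_num_batched_tokens now hash differently,
# so a stale torch.compile artifact is not reused across them.
assert TinySchedulerConfig(2048).compute_hash() != TinySchedulerConfig(8192).compute_hash()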
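Reason 2 can be made concrete with a toy calculation. The 2**31 - 1 bound is the general int32 limit; the buffer shape below is a made-up example, not vLLM's actual allocation:

# A compiler can index a buffer with int32 only while the element count
# fits in int32 range, so the maximum batch size feeds codegen decisions.
INT32_MAX = 2**31 - 1


def index_width(max_num_batched_tokens: int, hidden_size: int = 8192) -> int:
    numel = max_num_batched_tokens * hidden_size  # hypothetical activation buffer
    return 32 if numel <= INT32_MAX else 64


print(index_width(8_192))      # 67,108,864 elements -> 32-bit indexing suffices
print(index_width(1_000_000))  # 8,192,000,000 elements -> needs 64-bit indexing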
@@ -265,10 +265,6 @@ class VllmConfig:
             vllm_factors.append("None")
         if self.lora_config:
             vllm_factors.append(self.lora_config.compute_hash())
-            # LoRA creates static buffers based on max_num_batched_tokens.
-            # The tensor sizes and strides get captured in the torch.compile
-            # graph explicitly.
-            vllm_factors.append(str(self.scheduler_config.max_num_batched_tokens))
         else:
             vllm_factors.append("None")
         if self.speculative_config:
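The removal above is safe because max_num_batched_tokens now reaches the top-level key through SchedulerConfig.compute_hash(); keeping the LoRA-branch append would double-count it, and (per the issue linked in the first hunk) gating it on LoRA missed the non-LoRA Inductor case. A minimal sketch of the composition, assuming VllmConfig re-hashes its sub-config hashes (names here are illustrative):

import hashlib


def compose_hash(sub_hashes: list[str]) -> str:
    # Top-level cache key: concatenate the sub-config hashes positionally
    # and hash the result, so any sub-config change changes the key.
    return hashlib.sha256("|".join(sub_hashes).encode()).hexdigest()


scheduler_hash = hashlib.sha256(str([8192]).encode()).hexdigest()  # factor list as in hunk 1
lora_hash = "None"  # stand-in for "no LoRA configured"
print(compose_hash([scheduler_hash, lora_hash]))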