Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-24 11:26:23 +08:00)
[BugFix] Fix using dbo_decode_token_threshold always (and ignoring dbo_prefill_token_threshold) (#25622)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
commit 6ca8d9753c
parent d70c154975
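The fix threads a uniform_decode flag from the model runner down into check_ubatch_thresholds, so the dual-batch-overlap (DBO) split logic can compare the batch size against the matching threshold instead of always using dbo_decode_token_threshold. Below is a minimal sketch of the intended selection logic, assuming the thresholds and an enable_dbo guard live on the parallel config (field names other than the two thresholds are assumptions, not taken from this commit):

# Minimal sketch, not the actual vLLM implementation.
def check_ubatch_thresholds(config, num_tokens: int, uniform_decode: bool) -> bool:
    if not config.enable_dbo:  # assumed guard for dual-batch overlap
        return False
    if uniform_decode:
        # Pure decode batch: compare against the decode threshold.
        return num_tokens >= config.dbo_decode_token_threshold
    # Prefill / mixed batch: compare against the prefill threshold,
    # which the pre-fix code ignored.
    return num_tokens >= config.dbo_prefill_token_threshold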
@@ -1045,11 +1045,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
         num_tokens_padded = num_tokens_unpadded + self.get_local_padding(
             num_tokens_unpadded)
+        uniform_decode = \
+            (max_num_scheduled_tokens == self.uniform_decode_query_len) and \
+            (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
         ubatch_slices, num_tokens_after_padding = \
             ubatch_split(num_scheduled_tokens,
                          num_tokens_unpadded,
                          num_tokens_padded,
-                         self.vllm_config)
+                         uniform_decode=uniform_decode,
+                         vllm_config=self.vllm_config)
 
         self.seq_lens.np[:num_reqs] = (
             self.input_batch.num_computed_tokens_cpu[:num_reqs] +
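The new uniform_decode flag is true only for a pure decode step, i.e. every scheduled request contributes exactly uniform_decode_query_len tokens. A small standalone illustration of the condition (the values below are made up for the example, not taken from the diff):

# Illustrative values only.
uniform_decode_query_len = 1                  # tokens per request in a plain decode step
num_scheduled = [1, 1, 1, 1]                  # four requests, one token each
num_reqs = len(num_scheduled)
max_num_scheduled_tokens = max(num_scheduled)
total_num_scheduled_tokens = sum(num_scheduled)

uniform_decode = (max_num_scheduled_tokens == uniform_decode_query_len) and \
    (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
assert uniform_decode  # every request decodes exactly one token

# A batch that still contains a prefill chunk breaks the condition:
assert max([128, 1, 1, 1]) != uniform_decode_query_len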
@@ -2989,7 +2993,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             num_scheduled_tokens,
             total_num_scheduled_tokens,
             total_num_scheduled_tokens,
-            self.vllm_config,
+            uniform_decode=uniform_decode,
+            vllm_config=self.vllm_config,
         )
 
         # If we failed to microbatch, currently need to resynchronize
@@ -139,6 +139,7 @@ def ubatch_split(
     num_scheduled_tokens_per_request: np.ndarray,
     num_tokens_unpadded: int,
     num_tokens_padded: int,
+    uniform_decode: bool,
     vllm_config: VllmConfig,
 ) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]:
     """
@@ -164,7 +165,7 @@ def ubatch_split(
     should_attempt_ubatching = check_ubatch_thresholds(
         parallel_config,
         num_tokens_unpadded,
-        vllm_config,
+        uniform_decode=uniform_decode,
     )
 
     # Don't microbatch unless every other DP worker is also microbatching