mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-14 06:55:01 +08:00
[perf] Use CPU tensor to reduce GPU->CPU sync (#25884)
Signed-off-by: Lehua Ding <lehuading@tencent.com>
This commit is contained in:
parent
d7e34b4210
commit
e184c9c510
@@ -2478,7 +2478,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
effective_drafter_max_model_len = (
|
effective_drafter_max_model_len = (
|
||||||
self.speculative_config.draft_model_config.max_model_len)
|
self.speculative_config.draft_model_config.max_model_len)
|
||||||
input_fits_in_drafter = spec_decode_common_attn_metadata and (
|
input_fits_in_drafter = spec_decode_common_attn_metadata and (
|
||||||
spec_decode_common_attn_metadata.seq_lens.max() +
|
spec_decode_common_attn_metadata.max_seq_len +
|
||||||
self.speculative_config.num_speculative_tokens
|
self.speculative_config.num_speculative_tokens
|
||||||
<= effective_drafter_max_model_len)
|
<= effective_drafter_max_model_len)
|
||||||
if use_padded_batch_for_eagle and input_fits_in_drafter:
|
if use_padded_batch_for_eagle and input_fits_in_drafter:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user