From eea2536a35fccdcbd8a5c7fc1641c747c2f48630 Mon Sep 17 00:00:00 2001 From: Lehua Ding Date: Tue, 30 Sep 2025 19:51:16 +0800 Subject: [PATCH] [perf] Use CPU tensor to reduce GPU->CPU sync (#25884) Signed-off-by: Lehua Ding Signed-off-by: yewentao256 --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bb5c3ea742936..f4c28dc24d70f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2478,7 +2478,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): effective_drafter_max_model_len = ( self.speculative_config.draft_model_config.max_model_len) input_fits_in_drafter = spec_decode_common_attn_metadata and ( - spec_decode_common_attn_metadata.seq_lens.max() + + spec_decode_common_attn_metadata.max_seq_len + self.speculative_config.num_speculative_tokens <= effective_drafter_max_model_len) if use_padded_batch_for_eagle and input_fits_in_drafter: