From 6dd94dbe94c1820a1e224cba65efcf0befa97995 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 24 Jan 2025 11:34:27 +0800 Subject: [PATCH] [perf] fix perf regression from #12253 (#12380) Signed-off-by: youkaichao --- vllm/worker/model_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index cf2f1c6b3b877..bf1a40d48a789 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -455,7 +455,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): self.enable_prompt_adapter = (self.runner.prompt_adapter_config is not None) self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper - self.decode_only = True # Attention metadata inputs. if self.attn_backend is not None: @@ -477,6 +476,10 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): finished_requests_ids: Optional[List[str]] = None) -> None: self.finished_requests_ids = finished_requests_ids + # if the current batch is decode-only. + # will be set to False if there is any non-decode request. + self.decode_only = True + # Intermediate data (data in CPU before going to GPU) for # the current sequence group. self.inter_data_list: List[