From 2705f03cad320b7ea8fff5646361059487fa978f Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Thu, 18 Sep 2025 12:40:34 -0700
Subject: [PATCH] [Chore] Minor simplification for GPU worker

Signed-off-by: Woosuk Kwon
---
 vllm/v1/worker/gpu_worker.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 6855526583f04..146045a910cae 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -334,7 +334,8 @@ class Worker(WorkerBase):
             self.model_runner._dummy_run(size,
                                          skip_eplb=True,
                                          remove_lora=False)
-        self.model_runner.maybe_remove_all_loras(self.model_runner.lora_config)
+        if self.lora_config is not None:
+            self.model_runner.maybe_remove_all_loras(self.lora_config)
 
         # Warmup and tune the kernels used during model execution before
         # cuda graph capture.
@@ -427,6 +428,9 @@ class Worker(WorkerBase):
         self,
         scheduler_output: "SchedulerOutput",
     ) -> Optional[Union[ModelRunnerOutput, AsyncModelRunnerOutput]]:
+        if len(get_pp_group().ranks) == 1:
+            return self.model_runner.execute_model(scheduler_output)
+
         intermediate_tensors = None
         forward_pass = scheduler_output.total_num_scheduled_tokens > 0
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
@@ -445,8 +449,6 @@ class Worker(WorkerBase):
 
         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
-        if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
-            return output
         assert isinstance(output, IntermediateTensors)
 
         parallel_config = self.vllm_config.parallel_config