diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index bdf9203b1b1d..43cb95fb47f9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -205,23 +205,18 @@ class EngineCore: self.batch_queue.put_nowait( (future, scheduler_output)) # type: ignore - # If all requests are scheduled or the job queue is full, + scheduled_batch = (scheduler_output is not None + and scheduler_output.total_num_scheduled_tokens > 0) + + # If no more requests can be scheduled and the job queue is not empty, # block until the first batch in the job queue is finished. - if (scheduler_output is None - or scheduler_output.total_num_scheduled_tokens == 0): - try: - future, scheduler_output = self.batch_queue.get( - timeout=POLLING_TIMEOUT_S) - # Blocking until the first result is available. - model_output = future.result() - self.batch_queue.task_done() - engine_core_outputs = self.scheduler.update_from_output( - scheduler_output, model_output) - except queue.Empty: - # If the queue is empty (timeout at .get), return - # an empty EngineCoreOutputs for logging. - engine_core_outputs = EngineCoreOutputs( - outputs=[], scheduler_stats=self.scheduler.make_stats()) + if not scheduled_batch and not self.batch_queue.empty(): + future, scheduler_output = self.batch_queue.get_nowait() + # Blocking until the first result is available. + model_output = future.result() + self.batch_queue.task_done() + engine_core_outputs = self.scheduler.update_from_output( + scheduler_output, model_output) return engine_core_outputs