diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 74b226b45424..2061806e6b36 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -50,6 +50,7 @@ class MultiprocExecutor(Executor):
         self.is_failed = False
         self.shutdown_event = threading.Event()
         self.failure_callback: Optional[FailureCallback] = None
+        self.io_thread_pool: Optional[ThreadPoolExecutor] = None
 
         self.world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
@@ -107,7 +108,6 @@ class MultiprocExecutor(Executor):
 
         # For pipeline parallel, we use a thread pool for asynchronous
         # execute_model.
-        self.io_thread_pool: Optional[ThreadPoolExecutor] = None
         if self.max_concurrent_batches > 1:
             # Note: must use only 1 IO thread to keep dequeue sequence
             # from the response queue