From b3b89cf755d5d4ca48f217c5c0303103d243214a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 16 Apr 2024 09:42:15 +0000 Subject: [PATCH] Renew TPU executor --- vllm/executor/tpu_executor.py | 77 ++++++++++------------------------- 1 file changed, 21 insertions(+), 56 deletions(-) diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py index 80e820209ccf7..54fc40c013261 100644 --- a/vllm/executor/tpu_executor.py +++ b/vllm/executor/tpu_executor.py @@ -1,10 +1,6 @@ -import os -from typing import Dict, List, Optional +from typing import Dict, List, Set, Tuple -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import SamplerOutput, SequenceGroupMetadata @@ -15,31 +11,13 @@ logger = init_logger(__name__) class TPUExecutor(ExecutorBase): - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - vision_language_config: Optional[VisionLanguageConfig], - ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - assert lora_config is None, "LoRA is not supported for TPU backend." - self.vision_language_config = vision_language_config - + def _init_executor(self) -> None: + assert not self.speculative_config, ( + "Speculative decoding not yet supported for TPU backend") # Instantiate the worker and load the model to the device. self._init_worker() - # Profile the memory usage and initialize the cache. - self._init_cache() def _init_worker(self): - os.environ["PJRT_DEVICE"] = "TPU" from vllm.worker.tpu_worker import TPUWorker assert self.parallel_config.world_size == 1, ( @@ -53,33 +31,24 @@ class TPUExecutor(ExecutorBase): self.driver_worker.init_device() self.driver_worker.load_model() - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. + def initialize_cache( + self, + num_gpu_blocks: int, + num_cpu_blocks: int, + ) -> None: + """Initialize the KV cache by invoking the underlying worker.""" + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. + logger.info(f"# TPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - The engine first profiles the existing memory usage. - Then, it allocates the remaining memory for KV blocks. - - .. tip:: - You may limit the usage of TPU HBM by adjusting the - `gpu_memory_utilization` parameter. + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. """ - # Get the maximum number of blocks that can be allocated on TPU. - num_tpu_blocks = self.driver_worker.profile_num_available_blocks( - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cache_dtype=self.cache_config.cache_dtype, - ) - logger.info(f"# TPU blocks: {num_tpu_blocks}") - - check_block_size_valid(num_tpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - self.cache_config.num_gpu_blocks = num_tpu_blocks - self.cache_config.num_cpu_blocks = 0 - - # Allocate the KV cache. - self.driver_worker.allocate_kv_cache(self.cache_config) - # Warm up the model. - self.driver_worker.warm_up_model() + return self.driver_worker.determine_num_available_blocks() def execute_model( self, @@ -102,7 +71,7 @@ class TPUExecutor(ExecutorBase): def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for TPU backend.") - def list_loras(self) -> List[int]: + def list_loras(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for TPU backend.") def check_health(self) -> None: @@ -125,7 +94,3 @@ class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase): blocks_to_swap_out=blocks_to_swap_out, blocks_to_copy=blocks_to_copy) return output - - async def check_health_async(self) -> None: - # TPUExecutor will always be healthy as long as it's running. - return