From b3b89cf755d5d4ca48f217c5c0303103d243214a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 16 Apr 2024 09:42:15 +0000
Subject: [PATCH] Renew TPU executor

---
 vllm/executor/tpu_executor.py | 77 ++++++++++-------------------------
 1 file changed, 21 insertions(+), 56 deletions(-)

diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py
index 80e820209ccf7..54fc40c013261 100644
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
@@ -1,10 +1,6 @@
-import os
-from typing import Dict, List, Optional
+from typing import Dict, List, Set, Tuple
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
-from vllm.executor.utils import check_block_size_valid
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
@@ -15,31 +11,13 @@ logger = init_logger(__name__)
 
 class TPUExecutor(ExecutorBase):
 
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        cache_config: CacheConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
-        device_config: DeviceConfig,
-        lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
-    ) -> None:
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.parallel_config = parallel_config
-        self.scheduler_config = scheduler_config
-        self.device_config = device_config
-        assert lora_config is None, "LoRA is not supported for TPU backend."
-        self.vision_language_config = vision_language_config
-
+    def _init_executor(self) -> None:
+        assert not self.speculative_config, (
+            "Speculative decoding not yet supported for TPU backend")
         # Instantiate the worker and load the model to the device.
         self._init_worker()
-        # Profile the memory usage and initialize the cache.
-        self._init_cache()
 
     def _init_worker(self):
-        os.environ["PJRT_DEVICE"] = "TPU"
         from vllm.worker.tpu_worker import TPUWorker
 
         assert self.parallel_config.world_size == 1, (
@@ -53,33 +31,24 @@ class TPUExecutor(ExecutorBase):
         self.driver_worker.init_device()
         self.driver_worker.load_model()
 
-    def _init_cache(self) -> None:
-        """Profiles the memory usage and initializes the KV cache.
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Log the KV cache block counts and forward them to the worker.
+
+        `num_gpu_blocks` is the count logged as "TPU blocks".
+        """
+        # NOTE: This is logged in the executor because there can be >1 worker
+        # with other executors. We could log in the engine level, but work
+        # remains to abstract away the device for non-GPU configurations.
+        logger.info("# TPU blocks: %s, # CPU blocks: %s", num_gpu_blocks,
+                    num_cpu_blocks)
+        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
 
-        The engine first profiles the existing memory usage.
-        Then, it allocates the remaining memory for KV blocks.
-
-        .. tip::
-            You may limit the usage of TPU HBM by adjusting the
-            `gpu_memory_utilization` parameter.
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks by invoking the
+        underlying worker.
         """
-        # Get the maximum number of blocks that can be allocated on TPU.
-        num_tpu_blocks = self.driver_worker.profile_num_available_blocks(
-            block_size=self.cache_config.block_size,
-            gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
-            cache_dtype=self.cache_config.cache_dtype,
-        )
-        logger.info(f"# TPU blocks: {num_tpu_blocks}")
-
-        check_block_size_valid(num_tpu_blocks, self.cache_config.block_size,
-                               self.model_config.max_model_len)
-        self.cache_config.num_gpu_blocks = num_tpu_blocks
-        self.cache_config.num_cpu_blocks = 0
-
-        # Allocate the KV cache.
-        self.driver_worker.allocate_kv_cache(self.cache_config)
-        # Warm up the model.
-        self.driver_worker.warm_up_model()
+        return self.driver_worker.determine_num_available_blocks()
 
     def execute_model(
         self,
@@ -102,7 +71,7 @@ class TPUExecutor(ExecutorBase):
     def remove_lora(self, lora_id: int) -> bool:
         raise NotImplementedError("LoRA is not implemented for TPU backend.")
 
-    def list_loras(self) -> List[int]:
+    def list_loras(self) -> Set[int]:
         raise NotImplementedError("LoRA is not implemented for TPU backend.")
 
     def check_health(self) -> None:
@@ -125,7 +94,3 @@ class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
             blocks_to_swap_out=blocks_to_swap_out,
             blocks_to_copy=blocks_to_copy)
         return output
-
-    async def check_health_async(self) -> None:
-        # TPUExecutor will always be healthy as long as it's running.
-        return