Renew TPU executor

This commit is contained in:
Woosuk Kwon 2024-04-16 09:42:15 +00:00
parent 6692a30266
commit b3b89cf755

View File

@ -1,10 +1,6 @@
import os
from typing import Dict, List, Optional
from typing import Dict, List, Set, Tuple
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VisionLanguageConfig)
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.executor.utils import check_block_size_valid
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
@ -15,31 +11,13 @@ logger = init_logger(__name__)
class TPUExecutor(ExecutorBase):
def __init__(
    self,
    model_config: ModelConfig,
    cache_config: CacheConfig,
    parallel_config: ParallelConfig,
    scheduler_config: SchedulerConfig,
    device_config: DeviceConfig,
    lora_config: Optional[LoRAConfig],
    vision_language_config: Optional[VisionLanguageConfig],
) -> None:
    """Keep references to the configuration objects handed to the executor.

    ``lora_config`` is only checked, never stored: any value other than
    ``None`` trips the assertion below.
    """
    # Core configuration, stored as-is under the same attribute names.
    self.model_config = model_config
    self.cache_config = cache_config
    self.parallel_config = parallel_config
    self.scheduler_config = scheduler_config
    self.device_config = device_config

    # A LoRA configuration is rejected outright for this executor.
    lora_requested = lora_config is not None
    assert not lora_requested, "LoRA is not supported for TPU backend."

    self.vision_language_config = vision_language_config
def _init_executor(self) -> None:
    """Run the executor start-up sequence: guard, worker, then cache.

    Raises:
        AssertionError: if ``self.speculative_config`` is truthy.
    """
    speculative = self.speculative_config
    assert not speculative, (
        "Speculative decoding not yet supported for TPU backend")

    # Step 1: create the worker and load the model onto the device.
    self._init_worker()

    # Step 2: profile memory usage and set up the cache.
    self._init_cache()
def _init_worker(self):
os.environ["PJRT_DEVICE"] = "TPU"
from vllm.worker.tpu_worker import TPUWorker
assert self.parallel_config.world_size == 1, (
@ -53,33 +31,24 @@ class TPUExecutor(ExecutorBase):
self.driver_worker.init_device()
self.driver_worker.load_model()
def _init_cache(self) -> None:
"""Profiles the memory usage and initializes the KV cache.
def initialize_cache(
    self,
    num_gpu_blocks: int,
    num_cpu_blocks: int,
) -> None:
    """Log the requested block counts, then forward them to the driver
    worker's ``initialize_cache``.
    """
    # NOTE: The counts are reported here, in the executor, since other
    # executors may drive more than one worker. Logging at the engine level
    # would need the device abstracted away for non-GPU configurations.
    block_summary = (f"# TPU blocks: {num_gpu_blocks}, "
                     f"# CPU blocks: {num_cpu_blocks}")
    logger.info(block_summary)

    worker = self.driver_worker
    worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
The engine first profiles the existing memory usage.
Then, it allocates the remaining memory for KV blocks.
.. tip::
You may limit the usage of TPU HBM by adjusting the
`gpu_memory_utilization` parameter.
def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks by invoking the
underlying worker.
"""
# NOTE(review): the statements from here through warm_up_model() profile,
# validate, allocate and warm up the cache on the driver worker. They read
# like a separate cache-initialisation path rather than part of a pure
# "how many blocks are available" query -- confirm they belong to this
# method before relying on them.
# Get the maximum number of blocks that can be allocated on TPU.
num_tpu_blocks = self.driver_worker.profile_num_available_blocks(
block_size=self.cache_config.block_size,
gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
cache_dtype=self.cache_config.cache_dtype,
)
logger.info(f"# TPU blocks: {num_tpu_blocks}")
# Validate the profiled block count against the block size and the
# model's maximum length (helper semantics not visible here).
check_block_size_valid(num_tpu_blocks, self.cache_config.block_size,
self.model_config.max_model_len)
# Record the profiled count on the cache config; the CPU block count is
# set to zero.
self.cache_config.num_gpu_blocks = num_tpu_blocks
self.cache_config.num_cpu_blocks = 0
# Allocate the KV cache.
self.driver_worker.allocate_kv_cache(self.cache_config)
# Warm up the model.
self.driver_worker.warm_up_model()
# Delegate to the driver worker and hand its result back unchanged.
return self.driver_worker.determine_num_available_blocks()
def execute_model(
self,
@ -102,7 +71,7 @@ class TPUExecutor(ExecutorBase):
def remove_lora(self, lora_id: int) -> bool:
    """Placeholder for LoRA removal; this executor always refuses.

    Raises:
        NotImplementedError: unconditionally, whatever ``lora_id`` is.
    """
    reason = "LoRA is not implemented for TPU backend."
    raise NotImplementedError(reason)
def list_loras(self) -> List[int]:
def list_loras(self) -> Set[int]:
    """Placeholder for listing LoRA ids; this executor always refuses.

    Raises:
        NotImplementedError: unconditionally.
    """
    reason = "LoRA is not implemented for TPU backend."
    raise NotImplementedError(reason)
def check_health(self) -> None:
@ -125,7 +94,3 @@ class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy)
return output
async def check_health_async(self) -> None:
    """Asynchronous health probe; completes immediately with ``None``."""
    # Nothing to verify: TPUExecutor will always be healthy as long as
    # it's running.
    return None