diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 916052ca5ebff..38c9545e37476 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -11,7 +11,6 @@ import torch.nn as nn
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
-from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
@@ -79,6 +78,8 @@ class Worker(WorkerBase):
         self.profiler = None
 
     def sleep(self, level: int = 1) -> None:
+        from vllm.device_allocator.cumem import CuMemAllocator
+
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
 
         # Save the buffers before level 2 sleep
@@ -101,6 +102,8 @@ class Worker(WorkerBase):
                     used_bytes / GiB_bytes)
 
     def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        from vllm.device_allocator.cumem import CuMemAllocator
+
         allocator = CuMemAllocator.get_instance()
         allocator.wake_up(tags)
 
@@ -174,6 +177,8 @@ class Worker(WorkerBase):
     # to hijack tensor allocation.
     def load_model(self) -> None:
         if self.vllm_config.model_config.enable_sleep_mode:
+            from vllm.device_allocator.cumem import CuMemAllocator
+
             allocator = CuMemAllocator.get_instance()
             assert allocator.get_current_usage() == 0, (
                 "Sleep mode can only be "
@@ -241,7 +246,10 @@ class Worker(WorkerBase):
 
     def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate GPU KV cache with the specified kv_cache_config."""
+
         if self.vllm_config.model_config.enable_sleep_mode:
+            from vllm.device_allocator.cumem import CuMemAllocator
+
             allocator = CuMemAllocator.get_instance()
             context = allocator.use_memory_pool(tag="kv_cache")
         else:
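
All five hunks make the same change: the module-level `CuMemAllocator` import is removed and re-introduced as a function-local import inside the sleep-mode code paths (`sleep`, `wake_up`, `load_model`, and `initialize_from_config`). As a result, importing `gpu_worker` no longer requires `vllm.device_allocator.cumem` to be importable; the dependency is only resolved when sleep mode is actually exercised. Below is a minimal sketch of the pattern, reduced to a free function for illustration; the `wake_up` signature and the allocator calls mirror the diff, but everything outside the import placement is simplified:

```python
from typing import Optional


def wake_up(tags: Optional[list[str]] = None) -> None:
    # Function-local import: the module is only resolved when this code
    # path runs, so importing *this* file never touches
    # vllm.device_allocator.cumem (which may fail to import on builds
    # without CUDA memory-pool support).
    from vllm.device_allocator.cumem import CuMemAllocator

    allocator = CuMemAllocator.get_instance()
    allocator.wake_up(tags)
```

Note that Python caches imported modules in `sys.modules`, so after the first call the function-local import is a cheap dictionary lookup rather than a repeated import, making the per-call overhead negligible.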