# Patch summary: in both async engine front-ends, sleep() now awaits
# self.reset_prefix_cache() before it asks the underlying engine to sleep.
#   - vllm/engine/async_llm_engine.py: AsyncLLMEngine.sleep, ahead of
#     self.engine.sleep(level)
#   - vllm/v1/engine/async_llm.py: AsyncLLM.sleep, ahead of
#     await self.engine_core.sleep_async(level)
# The new call passes no arguments to reset_prefix_cache().
# NOTE(review): presumably the reset keeps cached prefix entries from
# outliving whatever state sleep releases; the rationale is not visible in
# this patch, so confirm against the engine's sleep implementation.
# Lines above the first "diff --git" header are commentary, not patch content.
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index b6ee4105340a1..73726eeab5fc7 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1092,6 +1092,7 @@ class AsyncLLMEngine(EngineClient):
         self.engine.reset_prefix_cache(device)
 
     async def sleep(self, level: int = 1) -> None:
+        await self.reset_prefix_cache()
         self.engine.sleep(level)
 
     async def wake_up(self, tags: Optional[list[str]] = None) -> None:
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index a2706327914c5..edc2e235c3c3f 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -576,6 +576,7 @@ class AsyncLLM(EngineClient):
         await self.engine_core.reset_prefix_cache_async()
 
     async def sleep(self, level: int = 1) -> None:
+        await self.reset_prefix_cache()
         await self.engine_core.sleep_async(level)
 
     async def wake_up(self, tags: Optional[list[str]] = None) -> None: