From dd6ac1c2bb3d29f8ba612a2f66f350a2c55c7e8b Mon Sep 17 00:00:00 2001
From: Zhuohan Li
Date: Fri, 14 Nov 2025 23:59:42 -0800
Subject: [PATCH] [RL] [V1] Remove unused device argument from reset_prefix_cache (#28766)

Signed-off-by: Zhuohan Li
---
 vllm/engine/protocol.py               |  2 +-
 vllm/entrypoints/llm.py               |  5 ++---
 vllm/entrypoints/openai/api_server.py | 10 +++-------
 vllm/v1/engine/async_llm.py           |  6 ++----
 vllm/v1/engine/llm_engine.py          |  3 +--
 5 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py
index 24fcd9fe1cab..462d2c4e50e7 100644
--- a/vllm/engine/protocol.py
+++ b/vllm/engine/protocol.py
@@ -125,7 +125,7 @@ class EngineClient(ABC):
         ...

     @abstractmethod
-    async def reset_prefix_cache(self, device: Device | None = None) -> None:
+    async def reset_prefix_cache(self) -> None:
         """Reset the prefix cache"""
         ...

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 62717a7eacdf..b0786bd355aa 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -32,7 +32,6 @@ from vllm.config.model import (
     TokenizerMode,
 )
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.protocol import Device
 from vllm.entrypoints.chat_utils import (
     ChatCompletionMessageParam,
     ChatTemplateContentFormatOption,
@@ -1499,8 +1498,8 @@ class LLM:
     def stop_profile(self) -> None:
         self.llm_engine.stop_profile()

-    def reset_prefix_cache(self, device: Device | None = None) -> None:
-        self.llm_engine.reset_prefix_cache(device)
+    def reset_prefix_cache(self) -> None:
+        self.llm_engine.reset_prefix_cache()

     def sleep(self, level: int = 1):
         """
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 3e59af717d95..3cf66fcd27e2 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -39,7 +39,7 @@ from typing_extensions import assert_never
 import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.protocol import Device, EngineClient
+from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.anthropic.protocol import (
     AnthropicError,
     AnthropicErrorResponse,
@@ -1069,12 +1069,8 @@ if envs.VLLM_SERVER_DEV_MODE:
         Reset the prefix cache. Note that we currently do not check if the
         prefix cache is successfully reset in the API server.
""" - device = None - device_str = raw_request.query_params.get("device") - if device_str is not None: - device = Device[device_str.upper()] - logger.info("Resetting prefix cache with specific %s...", str(device)) - await engine_client(raw_request).reset_prefix_cache(device) + logger.info("Resetting prefix cache...") + await engine_client(raw_request).reset_prefix_cache() return Response(status_code=200) @router.post("/reset_mm_cache") diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 48ea6ef8515c..c160c7cbcab4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -14,7 +14,7 @@ import torch import vllm.envs as envs from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.protocol import Device, EngineClient +from vllm.engine.protocol import EngineClient from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs import PromptType from vllm.logger import init_logger @@ -672,9 +672,7 @@ class AsyncLLM(EngineClient): self.processor.clear_mm_cache() await self.engine_core.reset_mm_cache_async() - async def reset_prefix_cache(self, device: Device | None = None) -> None: - if device == Device.CPU: - raise ValueError("Not supported on CPU.") + async def reset_prefix_cache(self) -> None: await self.engine_core.reset_prefix_cache_async() async def sleep(self, level: int = 1) -> None: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1db83446ba0b..e403cea87788 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -14,7 +14,6 @@ from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group from vllm.distributed.parallel_state import get_dp_group from vllm.engine.arg_utils import EngineArgs -from vllm.engine.protocol import Device from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -321,7 +320,7 @@ class LLMEngine: self.processor.clear_mm_cache() self.engine_core.reset_mm_cache() - def reset_prefix_cache(self, device: Device | None = None): + def reset_prefix_cache(self): self.engine_core.reset_prefix_cache() def sleep(self, level: int = 1):