[Responses API] Disable response store by default (#22137)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Author: Woosuk Kwon, 2025-08-03 04:04:21 -07:00 (committed by GitHub)
parent aefeea0fde
commit 6d98843b31
4 changed files with 46 additions and 10 deletions

View File

@@ -21,12 +21,16 @@ def default_server_args():
 @pytest.fixture(scope="module")
-def server(default_server_args):
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+def server_with_store(default_server_args):
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server
 
 
 @pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
+async def client(server_with_store):
+    async with server_with_store.get_async_client() as async_client:
         yield async_client
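Since the existing fixture now opts back into the store via env_dict, covering the new default path would take a server started without the override. The sketch below is not part of this commit; it assumes the RemoteOpenAIServer helper, MODEL_NAME, and the default_server_args fixture from the same test module, and the fixture names are hypothetical.

import pytest
import pytest_asyncio


@pytest.fixture(scope="module")
def server_without_store(default_server_args):
    # No env_dict override: the server keeps the new default
    # VLLM_ENABLE_RESPONSES_API_STORE=0, so store=True requests are rejected.
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client_without_store(server_without_store):
    async with server_without_store.get_async_client() as async_client:
        yield async_client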

View File

@@ -37,8 +37,11 @@ def default_image_server_args():
 @pytest.fixture(scope="module")
 def image_server(default_image_server_args):
-    with RemoteOpenAIServer(MODEL_NAME,
-                            default_image_server_args) as remote_server:
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_image_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server

View File

@@ -11,6 +11,7 @@ import jinja2
 from fastapi import Request
 from openai.types.responses import ResponseOutputMessage, ResponseOutputText
 
+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
@@ -89,15 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)
 
+        # False by default.
+        self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove responses
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove responses from the store.
         self.response_store: dict[str, ResponsesResponse] = {}
         self.response_store_lock = asyncio.Lock()
 
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove messages
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove messages from the store.
         self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {}
 
         self.background_tasks: dict[str, asyncio.Task] = {}
@@ -118,6 +121,10 @@ class OpenAIServingResponses(OpenAIServing):
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
+        # If store is not enabled, return an error.
+        if request.store and not self.enable_store:
+            return self._make_store_not_supported_error()
+
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
         if prev_response_id is not None:
@@ -456,3 +463,13 @@ class OpenAIServingResponses(OpenAIServing):
             message=f"Response with id '{response_id}' not found.",
             status_code=HTTPStatus.NOT_FOUND,
         )
+
+    def _make_store_not_supported_error(self) -> ErrorResponse:
+        return self.create_error_response(
+            err_type="invalid_request_error",
+            message=("`store=True` (default) is not supported. Please set "
+                     "`store=False` in Responses API or set "
+                     "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
+                     "starting the vLLM server."),
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
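With the gate above, a client must either pass store=False or run the server with VLLM_ENABLE_RESPONSES_API_STORE=1. A minimal client-side sketch using the openai SDK; the server URL and model name are placeholders, and the exception type for the 400 response is assumed to be openai.BadRequestError.

import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Accepted: explicitly opt out of storage, matching the new server default.
resp = client.responses.create(
    model="my-model",  # placeholder model name
    input="Say hello.",
    store=False,
)
print(resp.output_text)

# Rejected with HTTP 400 (invalid_request_error) unless the server was started
# with VLLM_ENABLE_RESPONSES_API_STORE=1, because `store` defaults to True.
try:
    client.responses.create(model="my-model", input="Say hello.")
except openai.BadRequestError as err:
    print("store not supported:", err)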

View File

@@ -151,6 +151,7 @@ if TYPE_CHECKING:
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ENABLE_RESPONSES_API_STORE: bool = False
 
 
 def get_default_cache_root():
@@ -1056,6 +1057,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
     lambda: bool(int(os.getenv(\
         "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
+
+    # Enables support for the "store" option in the OpenAI Responses API.
+    # When set to 1, vLLM's OpenAI server will retain the input and output
+    # messages for those requests in memory. By default, this is disabled (0).
+    # NOTE/WARNING:
+    # 1. Messages are kept in memory only (not persisted to disk) and will be
+    #    lost when the vLLM server shuts down.
+    # 2. Enabling this option will cause a memory leak, as stored messages are
+    #    never removed from memory until the server terminates.
+    "VLLM_ENABLE_RESPONSES_API_STORE":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
 }
 
 # --8<-- [end:env-vars-definition]
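For reference, the new entry follows the same parsing convention as the neighboring flags: the value is read with os.getenv (default "0") and converted via bool(int(...)), so an unset variable or "0" leaves the store disabled while "1" enables it. A small self-contained sketch of those semantics; the helper name is hypothetical and not part of the commit.

def responses_store_enabled(env: dict[str, str]) -> bool:
    # Mirrors the lambda added above: unset or "0" -> False, "1" -> True.
    return bool(int(env.get("VLLM_ENABLE_RESPONSES_API_STORE", "0")))


print(responses_store_enabled({}))                                        # False (new default)
print(responses_store_enabled({"VLLM_ENABLE_RESPONSES_API_STORE": "1"}))  # True

To restore the previous behavior, set the variable when starting the server, e.g. VLLM_ENABLE_RESPONSES_API_STORE=1 vllm serve <model>, accepting the in-memory growth described in the warning above.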