[Responses API] Ignore store=True and process the request by default (#22185)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Authored by Woosuk Kwon on 2025-08-04 05:12:48 -07:00, committed by GitHub
parent a5fff3bd49
commit 9af654cc38
2 changed files with 30 additions and 4 deletions
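
From the client's perspective, the change means a default-configured vLLM server now serves Responses API requests instead of rejecting them for carrying the API's default store=true. A hedged sketch using the openai Python SDK (base URL, API key, and model name are placeholders):

from openai import OpenAI

# Placeholders: point the client at a locally launched vLLM server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# The Responses API defaults to store=true, so even requests that never
# mention "store" arrive with it set. Before this commit such requests
# were rejected unless VLLM_ENABLE_RESPONSES_API_STORE=1 was set; after
# it, the flag is silently dropped and the request is processed normally.
response = client.responses.create(
    model="my-model",  # placeholder model name
    input="Hello!",
)
print(response.output_text)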

vllm/entrypoints/openai/serving_responses.py

@@ -90,8 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)
 
-        # False by default.
+        # If False (default), the "store" option is (silently) ignored and the
+        # response is not stored. If True, the response is stored in memory.
+        # NOTE(woosuk): This may not be intuitive for users, as the default
+        # behavior in OpenAI's Responses API is to store the response, but
+        # vLLM's default behavior is not.
         self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
+        if self.enable_store:
+            logger.warning_once(
+                "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
+                "cause a memory leak since we never remove responses from "
+                "the store.")
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
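
The hunk above introduces an opt-in flag read once at construction plus a one-time warning. A self-contained sketch of that pattern using only the stdlib (warn_once stands in for vLLM's logger.warning_once; it is not vLLM's actual helper):

import functools
import logging
import os

logger = logging.getLogger(__name__)

@functools.lru_cache(maxsize=None)
def warn_once(msg: str) -> None:
    # lru_cache deduplicates calls with the same message, mimicking
    # a warning_once-style logger method.
    logger.warning(msg)

# False by default: storing responses is opt-in via the environment.
enable_store = os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0") == "1"
if enable_store:
    warn_once("Response store enabled; entries are never evicted, so "
              "memory grows with every stored response.")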
@@ -121,9 +130,25 @@ class OpenAIServingResponses(OpenAIServing):
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
-        # If store is not enabled, return an error.
         if request.store and not self.enable_store:
-            return self._make_store_not_supported_error()
+            if request.background:
+                return self.create_error_response(
+                    err_type="invalid_request_error",
+                    message=(
+                        "This vLLM engine does not support `store=True` and "
+                        "therefore does not support the background mode. To "
+                        "enable these features, set the environment variable "
+                        "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
+                        "the vLLM server."),
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            # Disable the store option.
+            # NOTE(woosuk): Although returning an error is possible, we opted
+            # to implicitly disable store and process the request anyway, as
+            # we assume most users do not intend to actually store the response
+            # (i.e., their request's `store=True` just because it's the default
+            # value).
+            request.store = False
 
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
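
The control flow in this hunk reduces to a small guard: background mode needs the store and therefore errors, while a plain store=True is silently downgraded. A runnable sketch with simplified stand-ins for vLLM's request and error types:

from dataclasses import dataclass
from http import HTTPStatus

@dataclass
class Request:
    store: bool = True        # mirrors the Responses API default
    background: bool = False

def handle_store(request: Request, enable_store: bool):
    """Return an error tuple, or None to proceed with normal processing."""
    if request.store and not enable_store:
        if request.background:
            # Background mode needs the store to poll results later, so it
            # cannot be silently degraded; reject the request instead.
            return (HTTPStatus.BAD_REQUEST,
                    "invalid_request_error: background mode requires "
                    "VLLM_ENABLE_RESPONSES_API_STORE=1")
        # Silently drop store=True: most clients only carry it because it
        # is the API default, not because they will read the response back.
        request.store = False
    return None

assert handle_store(Request(), enable_store=False) is None
err = handle_store(Request(background=True), enable_store=False)
assert err is not None and err[0] is HTTPStatus.BAD_REQUEST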

vllm/envs.py

@@ -1060,7 +1060,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Enables support for the "store" option in the OpenAI Responses API.
    # When set to 1, vLLM's OpenAI server will retain the input and output
-   # messages for those requests in memory. By default, this is disabled (0).
+   # messages for those requests in memory. By default, this is disabled (0),
+   # and the "store" option is ignored.
    # NOTE/WARNING:
    # 1. Messages are kept in memory only (not persisted to disk) and will be
    #    lost when the vLLM server shuts down.
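
For reference, entries in this environment_variables dict map a variable name to a zero-argument parser that is evaluated lazily. A sketch of how such a 0/1 flag is plausibly registered (the exact lambda in vllm/envs.py may differ):

import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    # 0/1 flag, disabled by default; parsed lazily at access time.
    "VLLM_ENABLE_RESPONSES_API_STORE":
    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
}

# Resolve the flag on demand, so os.environ changes made before first
# access are respected.
enable_store = environment_variables["VLLM_ENABLE_RESPONSES_API_STORE"]()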