[Responses API] Ignore store=True and process the request by default (#22185)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
commit 9af654cc38 (parent a5fff3bd49)
@@ -90,8 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)
 
-        # False by default.
+        # If False (default), the "store" option is (silently) ignored and the
+        # response is not stored. If True, the response is stored in memory.
+        # NOTE(woosuk): This may not be intuitive for users, as the default
+        # behavior in OpenAI's Responses API is to store the response, but
+        # vLLM's default behavior is not.
         self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
+        if self.enable_store:
+            logger.warning_once(
+                "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
+                "cause a memory leak since we never remove responses from "
+                "the store.")
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
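For context, a minimal client-side sketch (not part of this diff) of the new default behavior: with VLLM_ENABLE_RESPONSES_API_STORE unset, a request carrying store=True (the OpenAI-side default) is now processed normally instead of being rejected, and the response is simply not retained. The base URL, API key, and model name below are placeholders, and the snippet assumes an installed openai client recent enough to expose client.responses.create.

from openai import OpenAI

# Assumed: a vLLM OpenAI-compatible server is already running at this address.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# `store=True` is the OpenAI default; with the store disabled on the server,
# vLLM now silently ignores it and still serves the request.
response = client.responses.create(
    model="Qwen/Qwen3-8B",
    input="Say hello in one short sentence.",
    store=True,
)
print(response.output_text)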
@@ -121,9 +130,25 @@ class OpenAIServingResponses(OpenAIServing):
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
-        # If store is not enabled, return an error.
         if request.store and not self.enable_store:
-            return self._make_store_not_supported_error()
+            if request.background:
+                return self.create_error_response(
+                    err_type="invalid_request_error",
+                    message=(
+                        "This vLLM engine does not support `store=True` and "
+                        "therefore does not support the background mode. To "
+                        "enable these features, set the environment variable "
+                        "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
+                        "the vLLM server."),
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            # Disable the store option.
+            # NOTE(woosuk): Although returning an error is possible, we opted
+            # to implicitly disable store and process the request anyway, as
+            # we assume most users do not intend to actually store the response
+            # (i.e., their request's `store=True` just because it's the default
+            # value).
+            request.store = False
 
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
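The hunk above keeps one hard failure: background mode genuinely needs the store, so it is still rejected when the store is disabled. A hedged sketch of what a client would observe, assuming the same placeholder server and model as before and an openai client whose responses.create accepts a background argument:

import openai
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

try:
    client.responses.create(
        model="Qwen/Qwen3-8B",
        input="Summarize the vLLM README.",
        background=True,  # background mode requires store support on the server
    )
except openai.BadRequestError as err:
    # Expected with the default config: HTTP 400 / invalid_request_error asking
    # you to launch the server with VLLM_ENABLE_RESPONSES_API_STORE=1.
    print("Rejected as expected:", err)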
@@ -1060,7 +1060,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Enables support for the "store" option in the OpenAI Responses API.
     # When set to 1, vLLM's OpenAI server will retain the input and output
-    # messages for those requests in memory. By default, this is disabled (0).
+    # messages for those requests in memory. By default, this is disabled (0),
+    # and the "store" option is ignored.
     # NOTE/WARNING:
     # 1. Messages are kept in memory only (not persisted to disk) and will be
     # lost when the vLLM server shuts down.
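For illustration, a standalone sketch of how a boolean flag like this is typically registered in vLLM's environment-variable registry (presumably vllm/envs.py): a zero-argument lambda that parses the raw environment string, defaulting to "0" (disabled). The exact parsing shown is an assumption, not a verbatim copy of the entry touched by this commit.

import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    # Enables support for the "store" option in the OpenAI Responses API.
    # Disabled ("0") by default, in which case `store=True` is ignored.
    "VLLM_ENABLE_RESPONSES_API_STORE":
    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
}

if __name__ == "__main__":
    # Launch the server with VLLM_ENABLE_RESPONSES_API_STORE=1 to opt back in
    # to storing responses (and to re-enable background mode).
    print(environment_variables["VLLM_ENABLE_RESPONSES_API_STORE"]())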