diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 5e9401cbd747..e009529fbd2a 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -90,8 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)
 
-        # False by default.
+        # If False (default), the "store" option is silently ignored and the
+        # response is not stored. If True, the response is stored in memory.
+        # NOTE(woosuk): This may not be intuitive for users, as the default
+        # behavior in OpenAI's Responses API is to store the response,
+        # whereas vLLM's default is not to.
         self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
+        if self.enable_store:
+            logger.warning_once(
+                "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
+                "cause a memory leak since we never remove responses from "
+                "the store.")
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
@@ -121,9 +130,25 @@ class OpenAIServingResponses(OpenAIServing):
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
-        # If store is not enabled, return an error.
         if request.store and not self.enable_store:
-            return self._make_store_not_supported_error()
+            if request.background:
+                return self.create_error_response(
+                    err_type="invalid_request_error",
+                    message=(
+                        "This vLLM engine does not support `store=True` and "
+                        "therefore does not support background mode. To "
+                        "enable these features, set the environment variable "
+                        "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
+                        "the vLLM server."),
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            # Disable the store option.
+            # NOTE(woosuk): Although returning an error is possible, we opted
+            # to implicitly disable store and process the request anyway, as
+            # we assume most users do not intend to actually store the
+            # response (i.e., their requests set `store=True` only because it
+            # is the default value).
+            request.store = False
 
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
diff --git a/vllm/envs.py b/vllm/envs.py
index 8d3c7eab471c..78f955f78a98 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1060,7 +1060,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Enables support for the "store" option in the OpenAI Responses API.
     # When set to 1, vLLM's OpenAI server will retain the input and output
-    # messages for those requests in memory. By default, this is disabled (0).
+    # messages for those requests in memory. By default, this is disabled (0),
+    # and the "store" option is ignored.
     # NOTE/WARNING:
     # 1. Messages are kept in memory only (not persisted to disk) and will be
     #    lost when the vLLM server shuts down.
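For reference, below is a minimal client-side sketch of the two code paths this patch introduces, using the official `openai` Python client. It assumes a vLLM server launched without `VLLM_ENABLE_RESPONSES_API_STORE=1`; the base URL, API key, and model name are placeholders, not values taken from the patch.

```python
# Sketch: exercising both branches of the new `store`/`background` handling
# against a vLLM server started WITHOUT VLLM_ENABLE_RESPONSES_API_STORE=1.
from openai import OpenAI, BadRequestError

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Path 1: `store=True` (the Responses API default) is silently disabled on
# the server; the request is still processed and a normal response returns.
resp = client.responses.create(
    model="my-model",  # placeholder model name
    input="Hello!",
    store=True,
)
print(resp.output_text)

# Path 2: background mode requires storage, so the server rejects it with an
# invalid_request_error (HTTP 400) instead of silently degrading.
try:
    client.responses.create(
        model="my-model",
        input="Hello!",
        background=True,  # `store` defaults to True in the Responses API
    )
except BadRequestError as e:
    print(e)  # message suggests launching with VLLM_ENABLE_RESPONSES_API_STORE=1
```

With the environment variable set, both calls succeed, and the server instead logs the one-time startup warning about responses never being evicted from the in-memory store.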