Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-14 04:24:56 +08:00)
[Responses API] Ignore store=True and process the request by default (#22185)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent a5fff3bd49
commit 9af654cc38
@@ -90,8 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)
 
-        # False by default.
+        # If False (default), the "store" option is (silently) ignored and the
+        # response is not stored. If True, the response is stored in memory.
+        # NOTE(woosuk): This may not be intuitive for users, as the default
+        # behavior in OpenAI's Responses API is to store the response, but
+        # vLLM's default behavior is not.
         self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
+        if self.enable_store:
+            logger.warning_once(
+                "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
+                "cause a memory leak since we never remove responses from "
+                "the store.")
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
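For illustration only: a minimal sketch of the kind of unbounded in-memory store that the HACK/FIXME comments above warn about. The ResponseStore name and its methods are hypothetical and are not vLLM's actual store implementation.

# Illustrative sketch, not vLLM code: an in-memory store with no eviction.
# Because entries are never removed, memory grows with every stored response,
# which is the leak the warning_once message above refers to.
class ResponseStore:
    def __init__(self) -> None:
        self._responses: dict[str, object] = {}

    def put(self, response_id: str, response: object) -> None:
        # No eviction policy: the entry stays until the server shuts down.
        self._responses[response_id] = response

    def get(self, response_id: str) -> object | None:
        return self._responses.get(response_id)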
@@ -121,9 +130,25 @@ class OpenAIServingResponses(OpenAIServing):
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
-        # If store is not enabled, return an error.
         if request.store and not self.enable_store:
-            return self._make_store_not_supported_error()
+            if request.background:
+                return self.create_error_response(
+                    err_type="invalid_request_error",
+                    message=(
+                        "This vLLM engine does not support `store=True` and "
+                        "therefore does not support the background mode. To "
+                        "enable these features, set the environment variable "
+                        "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
+                        "the vLLM server."),
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            # Disable the store option.
+            # NOTE(woosuk): Although returning an error is possible, we opted
+            # to implicitly disable store and process the request anyway, as
+            # we assume most users do not intend to actually store the response
+            # (i.e., their request's `store=True` just because it's the default
+            # value).
+            request.store = False
 
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
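From the client side, the behavior introduced by this hunk might look like the sketch below. It assumes the `openai` Python SDK's Responses API support and a locally running vLLM OpenAI-compatible server; the base URL, API key, and model name are placeholders.

# Hedged sketch: exercising the new default behavior from a client.
# Base URL, api_key, and model name below are placeholders.
import openai
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# `store` defaults to True in the Responses API. Without
# VLLM_ENABLE_RESPONSES_API_STORE=1 on the server, vLLM now silently sets
# store=False and processes the request instead of rejecting it.
resp = client.responses.create(model="my-model", input="Hello!")
print(resp.output_text)

# Background mode still depends on the store, so this request is expected to
# fail with an invalid_request_error unless the env var is set on the server.
try:
    client.responses.create(model="my-model", input="Hello!", background=True)
except openai.BadRequestError as err:
    print("background mode rejected:", err)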
@@ -1060,7 +1060,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Enables support for the "store" option in the OpenAI Responses API.
     # When set to 1, vLLM's OpenAI server will retain the input and output
-    # messages for those requests in memory. By default, this is disabled (0).
+    # messages for those requests in memory. By default, this is disabled (0),
+    # and the "store" option is ignored.
     # NOTE/WARNING:
     # 1. Messages are kept in memory only (not persisted to disk) and will be
     #    lost when the vLLM server shuts down.
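If the store is enabled on the server (for example by launching it with VLLM_ENABLE_RESPONSES_API_STORE=1), stored responses can be chained through previous_response_id. A hedged sketch, again with placeholder base URL and model name:

# Sketch assuming the server was started with the store enabled,
# e.g. `VLLM_ENABLE_RESPONSES_API_STORE=1 vllm serve <model>`.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

first = client.responses.create(model="my-model",
                                input="Remember the number 42.")

# With the store enabled, the response above is kept in server memory (only
# until shutdown), so a follow-up request can reference it by ID.
follow_up = client.responses.create(
    model="my-model",
    input="What number did I ask you to remember?",
    previous_response_id=first.id,
)
print(follow_up.output_text)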