[Responses API] Ignore store=True and process the request by default (#22185)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Authored by Woosuk Kwon on 2025-08-04 05:12:48 -07:00, committed by GitHub
parent a5fff3bd49
commit 9af654cc38
2 changed files with 30 additions and 4 deletions
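
From the client's perspective, the change means a default-configured vLLM server now serves Responses API requests instead of rejecting them for carrying the API's default store=true. A hedged sketch using the openai Python SDK (base URL, API key, and model name are placeholders):

from openai import OpenAI

# Placeholders: point the client at a locally launched vLLM server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# The Responses API defaults to store=true, so even requests that never
# mention "store" arrive with it set. Before this commit such requests
# were rejected unless VLLM_ENABLE_RESPONSES_API_STORE=1 was set; after
# it, the flag is silently dropped and the request is processed normally.
response = client.responses.create(
    model="my-model",  # placeholder model name
    input="Hello!",
)
print(response.output_text)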

vllm/entrypoints/openai/serving_responses.py

@@ -90,8 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)
 
-        # False by default.
+        # If False (default), the "store" option is (silently) ignored and the
+        # response is not stored. If True, the response is stored in memory.
+        # NOTE(woosuk): This may not be intuitive for users, as the default
+        # behavior in OpenAI's Responses API is to store the response, but
+        # vLLM's default behavior is not.
         self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
+        if self.enable_store:
+            logger.warning_once(
+                "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
+                "cause a memory leak since we never remove responses from "
+                "the store.")
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
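
The hunk above introduces an opt-in flag read once at construction plus a one-time warning. A self-contained sketch of that pattern using only the stdlib (warn_once stands in for vLLM's logger.warning_once; it is not vLLM's actual helper):

import functools
import logging
import os

logger = logging.getLogger(__name__)

@functools.lru_cache(maxsize=None)
def warn_once(msg: str) -> None:
    # lru_cache deduplicates calls with the same message, mimicking
    # a warning_once-style logger method.
    logger.warning(msg)

# False by default: storing responses is opt-in via the environment.
enable_store = os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0") == "1"
if enable_store:
    warn_once("Response store enabled; entries are never evicted, so "
              "memory grows with every stored response.")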
@@ -121,9 +130,25 @@ class OpenAIServingResponses(OpenAIServing):
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
-        # If store is not enabled, return an error.
         if request.store and not self.enable_store:
-            return self._make_store_not_supported_error()
+            if request.background:
+                return self.create_error_response(
+                    err_type="invalid_request_error",
+                    message=(
+                        "This vLLM engine does not support `store=True` and "
+                        "therefore does not support the background mode. To "
+                        "enable these features, set the environment variable "
+                        "`VLLM_ENABLE_RESPONSES_API_STORE=1` when launching "
+                        "the vLLM server."),
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            # Disable the store option.
+            # NOTE(woosuk): Although returning an error is possible, we opted
+            # to implicitly disable store and process the request anyway, as
+            # we assume most users do not intend to actually store the response
+            # (i.e., their request's `store=True` just because it's the default
+            # value).
+            request.store = False
 
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
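
The control flow in this hunk reduces to a small guard: background mode needs the store and therefore errors, while a plain store=True is silently downgraded. A runnable sketch with simplified stand-ins for vLLM's request and error types:

from dataclasses import dataclass
from http import HTTPStatus

@dataclass
class Request:
    store: bool = True        # mirrors the Responses API default
    background: bool = False

def handle_store(request: Request, enable_store: bool):
    """Return an error tuple, or None to proceed with normal processing."""
    if request.store and not enable_store:
        if request.background:
            # Background mode needs the store to poll results later, so it
            # cannot be silently degraded; reject the request instead.
            return (HTTPStatus.BAD_REQUEST,
                    "invalid_request_error: background mode requires "
                    "VLLM_ENABLE_RESPONSES_API_STORE=1")
        # Silently drop store=True: most clients only carry it because it
        # is the API default, not because they will read the response back.
        request.store = False
    return None

assert handle_store(Request(), enable_store=False) is None
err = handle_store(Request(background=True), enable_store=False)
assert err is not None and err[0] is HTTPStatus.BAD_REQUEST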

vllm/envs.py

@@ -1060,7 +1060,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Enables support for the "store" option in the OpenAI Responses API.
    # When set to 1, vLLM's OpenAI server will retain the input and output
-   # messages for those requests in memory. By default, this is disabled (0).
+   # messages for those requests in memory. By default, this is disabled (0),
+   # and the "store" option is ignored.
    # NOTE/WARNING:
    # 1. Messages are kept in memory only (not persisted to disk) and will be
    #    lost when the vLLM server shuts down.
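
For reference, entries in this environment_variables dict map a variable name to a zero-argument parser that is evaluated lazily. A sketch of how such a 0/1 flag is plausibly registered (the exact lambda in vllm/envs.py may differ):

import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    # 0/1 flag, disabled by default; parsed lazily at access time.
    "VLLM_ENABLE_RESPONSES_API_STORE":
    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
}

# Resolve the flag on demand, so os.environ changes made before first
# access are respected.
enable_store = environment_variables["VLLM_ENABLE_RESPONSES_API_STORE"]()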