[Responses API] Disable response store by default (#22137)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent aefeea0fde
commit 6d98843b31
@@ -21,12 +21,16 @@ def default_server_args():
 
 
 @pytest.fixture(scope="module")
-def server(default_server_args):
-    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
+def server_with_store(default_server_args):
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server
 
 
 @pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
+async def client(server_with_store):
+    async with server_with_store.get_async_client() as async_client:
         yield async_client
@@ -37,8 +37,11 @@ def default_image_server_args():
 
 
 @pytest.fixture(scope="module")
 def image_server(default_image_server_args):
-    with RemoteOpenAIServer(MODEL_NAME,
-                            default_image_server_args) as remote_server:
+    with RemoteOpenAIServer(
+            MODEL_NAME,
+            default_image_server_args,
+            env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
+    ) as remote_server:
         yield remote_server
 
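Both fixtures above re-enable the store for the spawned test server through RemoteOpenAIServer's env_dict argument, which presumably overlays extra environment variables onto the server subprocess. A minimal sketch of that pattern under that assumption (launch_with_env is illustrative, not vLLM's actual test helper):

import os
import subprocess


def launch_with_env(model: str, args: list[str],
                    env_dict: dict[str, str]) -> subprocess.Popen:
    """Start a vLLM OpenAI-compatible server with extra env vars set."""
    # Copy the parent environment and overlay the requested variables so the
    # child process sees VLLM_ENABLE_RESPONSES_API_STORE=1.
    env = os.environ.copy()
    env.update(env_dict)
    return subprocess.Popen(["vllm", "serve", model, *args], env=env)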
@@ -11,6 +11,7 @@ import jinja2
 from fastapi import Request
 from openai.types.responses import ResponseOutputMessage, ResponseOutputText
 
+from vllm import envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
@@ -89,15 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)
 
+        # False by default.
+        self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove responses
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove responses from the store.
         self.response_store: dict[str, ResponsesResponse] = {}
         self.response_store_lock = asyncio.Lock()
 
         # HACK(woosuk): This is a hack. We should use a better store.
-        # FIXME: This causes a memory leak since we never remove messages
-        # from the store.
+        # FIXME: If enable_store=True, this may cause a memory leak since we
+        # never remove messages from the store.
         self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {}
 
         self.background_tasks: dict[str, asyncio.Task] = {}
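The two dicts and the asyncio.Lock above are the entire in-memory store that enable_store now gates. A self-contained sketch of the access pattern they imply (this toy class is illustrative only; vLLM keeps the dicts directly on OpenAIServingResponses rather than in a separate class):

import asyncio
from typing import Optional


class InMemoryResponseStore:
    """Toy stand-in for the response_store dict + lock pair shown above."""

    def __init__(self, enabled: bool) -> None:
        self.enabled = enabled
        self._responses: dict[str, object] = {}
        self._lock = asyncio.Lock()

    async def put(self, response_id: str, response: object) -> None:
        # When the store is disabled (the new default), nothing is retained.
        if not self.enabled:
            return
        # Entries are never evicted, which is the memory leak the FIXME
        # comments above warn about when the store is enabled.
        async with self._lock:
            self._responses[response_id] = response

    async def get(self, response_id: str) -> Optional[object]:
        async with self._lock:
            return self._responses.get(response_id)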
@@ -118,6 +121,10 @@ class OpenAIServingResponses(OpenAIServing):
         if self.engine_client.errored:
             raise self.engine_client.dead_error
 
+        # If store is not enabled, return an error.
+        if request.store and not self.enable_store:
+            return self._make_store_not_supported_error()
+
         # Handle the previous response ID.
         prev_response_id = request.previous_response_id
         if prev_response_id is not None:
@@ -456,3 +463,13 @@ class OpenAIServingResponses(OpenAIServing):
             message=f"Response with id '{response_id}' not found.",
             status_code=HTTPStatus.NOT_FOUND,
         )
+
+    def _make_store_not_supported_error(self) -> ErrorResponse:
+        return self.create_error_response(
+            err_type="invalid_request_error",
+            message=("`store=True` (default) is not supported. Please set "
+                     "`store=False` in Responses API or set "
+                     "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
+                     "starting the vLLM server."),
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
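From the client's point of view, the new guard means any Responses API request with store=True (the API default) against a server started without VLLM_ENABLE_RESPONSES_API_STORE=1 is rejected with HTTP 400 and the message above. A hedged sketch of the per-request workaround using the official openai SDK (base URL and model name are placeholders):

from openai import OpenAI

# Point the SDK at a locally running vLLM OpenAI-compatible server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# With the store disabled on the server, store must be turned off per request;
# otherwise the server answers with the invalid_request_error added above.
response = client.responses.create(
    model="my-served-model",  # placeholder for whatever the server is serving
    input="Write a haiku about GPUs.",
    store=False,
)
print(response.output_text)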

vllm/envs.py (12 changed lines)
@@ -151,6 +151,7 @@ if TYPE_CHECKING:
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ENABLE_RESPONSES_API_STORE: bool = False
 
 
 def get_default_cache_root():
@@ -1056,6 +1057,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
     lambda: bool(int(os.getenv(\
         "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
+
+    # Enables support for the "store" option in the OpenAI Responses API.
+    # When set to 1, vLLM's OpenAI server will retain the input and output
+    # messages for those requests in memory. By default, this is disabled (0).
+    # NOTE/WARNING:
+    # 1. Messages are kept in memory only (not persisted to disk) and will be
+    #    lost when the vLLM server shuts down.
+    # 2. Enabling this option will cause a memory leak, as stored messages are
+    #    never removed from memory until the server terminates.
+    "VLLM_ENABLE_RESPONSES_API_STORE":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
 }
 
 # --8<-- [end:env-vars-definition]
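The new entry is decoded with bool(int(os.getenv(...))), so the variable must be an integer string: the default "0" keeps the store off and any nonzero integer turns it on. A small standalone illustration of that parsing convention:

import os


def responses_store_enabled() -> bool:
    # Mirrors the lambda registered above: os.getenv returns a string,
    # int() parses it, and bool() maps 0 -> False and any other integer
    # to True. Non-integer values such as "true" raise ValueError, so
    # stick to "0" or "1".
    return bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0")))


# With the variable unset, the store stays off (the new default).
print(responses_store_enabled())  # False unless the env var is set to "1"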