From ef65dcfa6f5820ce9e4a2411e9be18586f6fd467 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 18 Mar 2024 22:05:34 -0700 Subject: [PATCH] [Doc] Add docs about OpenAI compatible server (#3288) --- docs/requirements-docs.txt | 7 + docs/source/conf.py | 4 +- docs/source/dev/sampling_params.rst | 4 + docs/source/index.rst | 10 +- docs/source/models/lora.rst | 2 +- docs/source/serving/integrations.rst | 11 ++ .../serving/openai_compatible_server.md | 114 ++++++++++++ vllm/entrypoints/openai/api_server.py | 108 +----------- vllm/entrypoints/openai/cli_args.py | 118 +++++++++++++ vllm/entrypoints/openai/protocol.py | 166 +++++++++++++----- 10 files changed, 383 insertions(+), 161 deletions(-) create mode 100644 docs/source/dev/sampling_params.rst create mode 100644 docs/source/serving/integrations.rst create mode 100644 docs/source/serving/openai_compatible_server.md create mode 100644 vllm/entrypoints/openai/cli_args.py diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 95e54bd15185..96749b9327d7 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,3 +1,10 @@ sphinx == 6.2.1 sphinx-book-theme == 1.0.1 sphinx-copybutton == 0.5.2 +myst-parser == 2.0.0 +sphinx-argparse + +# packages to install to build the documentation +pydantic +-f https://download.pytorch.org/whl/cpu +torch \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 61d24e161212..2ca0d642b746 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,7 +22,7 @@ logger = logging.getLogger(__name__) # -- Project information ----------------------------------------------------- project = 'vLLM' -copyright = '2023, vLLM Team' +copyright = '2024, vLLM Team' author = 'the vLLM Team' # -- General configuration --------------------------------------------------- @@ -37,6 +37,8 @@ extensions = [ "sphinx_copybutton", "sphinx.ext.autodoc", "sphinx.ext.autosummary", + "myst_parser", + "sphinxarg.ext", ] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/dev/sampling_params.rst b/docs/source/dev/sampling_params.rst new file mode 100644 index 000000000000..844859b3ec1f --- /dev/null +++ b/docs/source/dev/sampling_params.rst @@ -0,0 +1,4 @@ +Sampling Params +=============== + +.. automodule:: vllm.sampling_params.SamplingParams \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 65bfbbabf8be..72081588b1bc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -69,14 +69,11 @@ Documentation :maxdepth: 1 :caption: Serving - serving/distributed_serving - serving/run_on_sky - serving/deploying_with_kserve - serving/deploying_with_triton - serving/deploying_with_bentoml + serving/openai_compatible_server serving/deploying_with_docker - serving/serving_with_langchain + serving/distributed_serving serving/metrics + serving/integrations .. toctree:: :maxdepth: 1 @@ -98,6 +95,7 @@ Documentation :maxdepth: 2 :caption: Developer Documentation + dev/sampling_params dev/engine/engine_index dev/kernel/paged_attention diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index f05fafe9f827..2278640481a9 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -90,7 +90,7 @@ Requests can specify the LoRA adapter as if it were any other model via the ``mo processed according to the server-wide LoRA configuration (i.e. 
in parallel with base model requests, and potentially other LoRA adapter requests if they were provided and ``max_loras`` is set high enough).
 
-The following is an example request
+The following is an example request
 
 .. code-block:: bash
diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst
new file mode 100644
index 000000000000..93872397913e
--- /dev/null
+++ b/docs/source/serving/integrations.rst
@@ -0,0 +1,11 @@
+Integrations
+------------
+
+.. toctree::
+   :maxdepth: 1
+
+   run_on_sky
+   deploying_with_kserve
+   deploying_with_triton
+   deploying_with_bentoml
+   serving_with_langchain
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
new file mode 100644
index 000000000000..032fe5d03bd5
--- /dev/null
+++ b/docs/source/serving/openai_compatible_server.md
@@ -0,0 +1,114 @@
+# OpenAI Compatible Server
+
+vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) APIs.
+
+You can start the server using Python, or using [Docker](deploying_with_docker.rst):
+```bash
+python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-hf --dtype float32 --api-key token-abc123
+```
+
+To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
+```python
+from openai import OpenAI
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="token-abc123",
+)
+
+completion = client.chat.completions.create(
+    model="meta-llama/Llama-2-7b-hf",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello!"}
+    ]
+)
+
+print(completion.choices[0].message)
+```
+
+## API Reference
+Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except:
+- Chat: `tools` and `tool_choice`.
+- Completions: `suffix`.
+
+## Extra Parameters
+vLLM supports a set of parameters that are not part of the OpenAI API.
+To use them, pass them as extra parameters through the OpenAI client,
+or merge them directly into the JSON payload if you are calling the HTTP API directly.
+
+```python
+completion = client.chat.completions.create(
+    model="meta-llama/Llama-2-7b-hf",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+    ],
+    extra_body={
+        "guided_choice": ["positive", "negative"]
+    }
+)
+```
+
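+The extra parameters can also be merged directly into the JSON payload when calling the HTTP API
+without the OpenAI client. The following is a minimal sketch using the third-party `requests`
+library; it assumes the server from the examples above is running on `localhost:8000` with
+`--api-key token-abc123`.
+
+```python
+import requests
+
+response = requests.post(
+    "http://localhost:8000/v1/chat/completions",
+    headers={"Authorization": "Bearer token-abc123"},
+    json={
+        "model": "meta-llama/Llama-2-7b-hf",
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+        ],
+        # Extra vLLM-specific parameters go at the top level of the payload.
+        "guided_choice": ["positive", "negative"]
+    },
+)
+print(response.json()["choices"][0]["message"]["content"])
+```
+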
+### Extra Parameters for Chat API
+The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-chat-completion-sampling-params
+:end-before: end-chat-completion-sampling-params
+```
+
+The following extra parameters are supported:
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-chat-completion-extra-params
+:end-before: end-chat-completion-extra-params
+```
+
+### Extra Parameters for Completions API
+The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-completion-sampling-params
+:end-before: end-completion-sampling-params
+```
+
+The following extra parameters are supported:
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-completion-extra-params
+:end-before: end-completion-extra-params
+```
+
+## Chat Template
+
+In order for the language model to support the chat protocol, vLLM requires the model to include
+a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
+specifies how roles, messages, and other chat-specific tokens are encoded in the input.
+
+An example chat template for `meta-llama/Llama-2-7b-chat-hf` can be found [here](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/09bd0f49e16738cdfaa6e615203e126038736eb0/tokenizer_config.json#L12).
+
+Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models,
+you can manually specify their chat template via the `--chat-template` parameter, passing either the file path to
+the chat template or the template itself in string form. Without a chat template, the server will not be able to
+process chat requests, and all chat requests will error.
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model ... \
+  --chat-template ./path-to-chat-template.jinja
+```
+
+The vLLM community provides a set of chat templates for popular models. You can find them in the examples
+directory [here](https://github.com/vllm-project/vllm/tree/main/examples/).
+
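+To check whether a model already ships a chat template (and to preview how it renders a conversation)
+before resorting to `--chat-template`, you can inspect its tokenizer configuration. The snippet below
+is a small sketch, not part of vLLM itself; it assumes the `transformers` library is installed and
+reuses the example model from this page.
+
+```python
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+
+if tokenizer.chat_template is None:
+    # No template in tokenizer_config.json; supply one via --chat-template.
+    print("No chat template found in the tokenizer configuration.")
+else:
+    # Render a sample conversation to see how the template encodes messages.
+    print(tokenizer.apply_chat_template(
+        [{"role": "user", "content": "Hello!"}],
+        tokenize=False,
+        add_generation_prompt=True,
+    ))
+```
+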
+## Command line arguments for the server
+
+```{argparse}
+:module: vllm.entrypoints.openai.cli_args
+:func: make_arg_parser
+:prog: vllm-openai-server
+```
\ No newline at end of file
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index e0626ca4e9da..a0685a4d38fb 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1,11 +1,8 @@
-import argparse
 import asyncio
-import json
 from contextlib import asynccontextmanager
 import os
 import importlib
 import inspect
-import ssl
 
 from prometheus_client import make_asgi_app
 import fastapi
@@ -23,9 +20,9 @@ from vllm.entrypoints.openai.protocol import (CompletionRequest,
                                               ChatCompletionRequest,
                                               ErrorResponse)
 from vllm.logger import init_logger
+from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_engine import LoRA
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
 
@@ -51,109 +48,8 @@ async def lifespan(app: fastapi.FastAPI):
 
 app = fastapi.FastAPI(lifespan=lifespan)
 
 
-class LoRAParserAction(argparse.Action):
-
-    def __call__(self, parser, namespace, values, option_string=None):
-        lora_list = []
-        for item in values:
-            name, path = item.split('=')
-            lora_list.append(LoRA(name, path))
-        setattr(namespace, self.dest, lora_list)
-
-
 def parse_args():
-    parser = argparse.ArgumentParser(
-        description="vLLM OpenAI-Compatible RESTful API server.")
-    parser.add_argument("--host", type=str, default=None, help="host name")
-    parser.add_argument("--port", type=int, default=8000, help="port number")
-    parser.add_argument(
-        "--uvicorn-log-level",
-        type=str,
-        default="info",
-        choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
-        help="log level for uvicorn")
-    parser.add_argument("--allow-credentials",
-                        action="store_true",
-                        help="allow credentials")
-    parser.add_argument("--allowed-origins",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed origins")
-    parser.add_argument("--allowed-methods",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed methods")
-    parser.add_argument("--allowed-headers",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed headers")
-    parser.add_argument("--api-key",
-                        type=str,
-                        default=None,
-                        help="If provided, the server will require this key "
-                        "to be presented in the header.")
-    parser.add_argument("--served-model-name",
-                        type=str,
-                        default=None,
-                        help="The model name used in the API. If not "
-                        "specified, the model name will be the same as "
-                        "the huggingface name.")
-    parser.add_argument(
-        "--lora-modules",
-        type=str,
-        default=None,
-        nargs='+',
-        action=LoRAParserAction,
-        help="LoRA module configurations in the format name=path. "
-        "Multiple modules can be specified.")
-    parser.add_argument("--chat-template",
-                        type=str,
-                        default=None,
-                        help="The file path to the chat template, "
-                        "or the template in single-line form "
-                        "for the specified model")
-    parser.add_argument("--response-role",
-                        type=str,
-                        default="assistant",
-                        help="The role name to return if "
-                        "`request.add_generation_prompt=true`.")
-    parser.add_argument("--ssl-keyfile",
-                        type=str,
-                        default=None,
-                        help="The file path to the SSL key file")
-    parser.add_argument("--ssl-certfile",
-                        type=str,
-                        default=None,
-                        help="The file path to the SSL cert file")
-    parser.add_argument("--ssl-ca-certs",
-                        type=str,
-                        default=None,
-                        help="The CA certificates file")
-    parser.add_argument(
-        "--ssl-cert-reqs",
-        type=int,
-        default=int(ssl.CERT_NONE),
-        help="Whether client certificate is required (see stdlib ssl module's)"
-    )
-    parser.add_argument(
-        "--root-path",
-        type=str,
-        default=None,
-        help="FastAPI root_path when app is behind a path based routing proxy")
-    parser.add_argument(
-        "--middleware",
-        type=str,
-        action="append",
-        default=[],
-        help="Additional ASGI middleware to apply to the app. "
-        "We accept multiple --middleware arguments. "
-        "The value should be an import path. "
-        "If a function is provided, vLLM will add it to the server "
-        "using @app.middleware('http'). "
-        "If a class is provided, vLLM will add it to the server "
-        "using app.add_middleware(). ")
-
-    parser = AsyncEngineArgs.add_cli_args(parser)
+    parser = make_arg_parser()
     return parser.parse_args()
 
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
new file mode 100644
index 000000000000..cc71931b9795
--- /dev/null
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -0,0 +1,118 @@
+"""
+This file contains the command line arguments for vLLM's
+OpenAI-compatible server. It is kept in a separate file for documentation
+purposes.
+""" + +import argparse +import json +import ssl + +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.entrypoints.openai.serving_engine import LoRA + + +class LoRAParserAction(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + lora_list = [] + for item in values: + name, path = item.split('=') + lora_list.append(LoRA(name, path)) + setattr(namespace, self.dest, lora_list) + + +def make_arg_parser(): + parser = argparse.ArgumentParser( + description="vLLM OpenAI-Compatible RESTful API server.") + parser.add_argument("--host", type=str, default=None, help="host name") + parser.add_argument("--port", type=int, default=8000, help="port number") + parser.add_argument( + "--uvicorn-log-level", + type=str, + default="info", + choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'], + help="log level for uvicorn") + parser.add_argument("--allow-credentials", + action="store_true", + help="allow credentials") + parser.add_argument("--allowed-origins", + type=json.loads, + default=["*"], + help="allowed origins") + parser.add_argument("--allowed-methods", + type=json.loads, + default=["*"], + help="allowed methods") + parser.add_argument("--allowed-headers", + type=json.loads, + default=["*"], + help="allowed headers") + parser.add_argument("--api-key", + type=str, + default=None, + help="If provided, the server will require this key " + "to be presented in the header.") + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. If not " + "specified, the model name will be the same as " + "the huggingface name.") + parser.add_argument( + "--lora-modules", + type=str, + default=None, + nargs='+', + action=LoRAParserAction, + help="LoRA module configurations in the format name=path. " + "Multiple modules can be specified.") + parser.add_argument("--chat-template", + type=str, + default=None, + help="The file path to the chat template, " + "or the template in single-line form " + "for the specified model") + parser.add_argument("--response-role", + type=str, + default="assistant", + help="The role name to return if " + "`request.add_generation_prompt=true`.") + parser.add_argument("--ssl-keyfile", + type=str, + default=None, + help="The file path to the SSL key file") + parser.add_argument("--ssl-certfile", + type=str, + default=None, + help="The file path to the SSL cert file") + parser.add_argument("--ssl-ca-certs", + type=str, + default=None, + help="The CA certificates file") + parser.add_argument( + "--ssl-cert-reqs", + type=int, + default=int(ssl.CERT_NONE), + help="Whether client certificate is required (see stdlib ssl module's)" + ) + parser.add_argument( + "--root-path", + type=str, + default=None, + help="FastAPI root_path when app is behind a path based routing proxy") + parser.add_argument( + "--middleware", + type=str, + action="append", + default=[], + help="Additional ASGI middleware to apply to the app. " + "We accept multiple --middleware arguments. " + "The value should be an import path. " + "If a function is provided, vLLM will add it to the server " + "using @app.middleware('http'). " + "If a class is provided, vLLM will add it to the server " + "using app.add_middleware(). 
") + + parser = AsyncEngineArgs.add_cli_args(parser) + return parser diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 942188041161..1f089d524fd0 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -61,41 +61,80 @@ class ResponseFormat(BaseModel): class ChatCompletionRequest(BaseModel): - model: str + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/chat/create messages: List[Dict[str, str]] - temperature: Optional[float] = 0.7 - top_p: Optional[float] = 1.0 - n: Optional[int] = 1 + model: str + frequency_penalty: Optional[float] = 0.0 + logit_bias: Optional[Dict[str, float]] = None + logprobs: Optional[bool] = False + top_logprobs: Optional[int] = None max_tokens: Optional[int] = None + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0.0 + response_format: Optional[ResponseFormat] = None seed: Optional[int] = None stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False - logprobs: Optional[bool] = False - top_logprobs: Optional[int] = None - presence_penalty: Optional[float] = 0.0 - frequency_penalty: Optional[float] = 0.0 - logit_bias: Optional[Dict[str, float]] = None + temperature: Optional[float] = 0.7 + top_p: Optional[float] = 1.0 user: Optional[str] = None - # Additional parameters supported by vLLM + + # doc: begin-chat-completion-sampling-params best_of: Optional[int] = None - top_k: Optional[int] = -1 - ignore_eos: Optional[bool] = False use_beam_search: Optional[bool] = False + top_k: Optional[int] = -1 + min_p: Optional[float] = 0.0 + repetition_penalty: Optional[float] = 1.0 + length_penalty: Optional[float] = 1.0 early_stopping: Optional[bool] = False + ignore_eos: Optional[bool] = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) skip_special_tokens: Optional[bool] = True spaces_between_special_tokens: Optional[bool] = True - add_generation_prompt: Optional[bool] = True - echo: Optional[bool] = False - repetition_penalty: Optional[float] = 1.0 - min_p: Optional[float] = 0.0 - include_stop_str_in_output: Optional[bool] = False - length_penalty: Optional[float] = 1.0 - guided_json: Optional[Union[str, dict, BaseModel]] = None - guided_regex: Optional[str] = None - guided_choice: Optional[List[str]] = None - guided_grammar: Optional[str] = None - response_format: Optional[ResponseFormat] = None + # doc: end-chat-completion-sampling-params + + # doc: begin-chat-completion-extra-params + echo: Optional[bool] = Field( + default=False, + description=( + "If true, the new message will be prepended with the last message " + "if they belong to the same role."), + ) + add_generation_prompt: Optional[bool] = Field( + default=True, + description= + ("If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model."), + ) + include_stop_str_in_output: Optional[bool] = Field( + default=False, + description=( + "Whether to include the stop string in the output. 
" + "This is only applied when the stop or stop_token_ids is set."), + ) + guided_json: Optional[Union[str, dict, BaseModel]] = Field( + default=None, + description=("If specified, the output will follow the JSON schema."), + ) + guided_regex: Optional[str] = Field( + default=None, + description=( + "If specified, the output will follow the regex pattern."), + ) + guided_choice: Optional[List[str]] = Field( + default=None, + description=( + "If specified, the output will be exactly one of the choices."), + ) + guided_grammar: Optional[str] = Field( + default=None, + description=( + "If specified, the output will follow the context free grammar."), + ) + + # doc: end-chat-completion-extra-params def to_sampling_params(self) -> SamplingParams: if self.logprobs and not self.top_logprobs: @@ -157,41 +196,74 @@ class ChatCompletionRequest(BaseModel): class CompletionRequest(BaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/completions/create model: str - # a string, array of strings, array of tokens, or array of token arrays prompt: Union[List[int], List[List[int]], str, List[str]] - suffix: Optional[str] = None + best_of: Optional[int] = None + echo: Optional[bool] = False + frequency_penalty: Optional[float] = 0.0 + logit_bias: Optional[Dict[str, float]] = None + logprobs: Optional[int] = None max_tokens: Optional[int] = 16 + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0.0 + seed: Optional[int] = None + stop: Optional[Union[str, List[str]]] = Field(default_factory=list) + stream: Optional[bool] = False + suffix: Optional[str] = None temperature: Optional[float] = 1.0 top_p: Optional[float] = 1.0 - n: Optional[int] = 1 - stream: Optional[bool] = False - logprobs: Optional[int] = None - echo: Optional[bool] = False - stop: Optional[Union[str, List[str]]] = Field(default_factory=list) - seed: Optional[int] = None - presence_penalty: Optional[float] = 0.0 - frequency_penalty: Optional[float] = 0.0 - best_of: Optional[int] = None - logit_bias: Optional[Dict[str, float]] = None user: Optional[str] = None - # Additional parameters supported by vLLM - top_k: Optional[int] = -1 - ignore_eos: Optional[bool] = False + + # doc: begin-completion-sampling-params use_beam_search: Optional[bool] = False + top_k: Optional[int] = -1 + min_p: Optional[float] = 0.0 + repetition_penalty: Optional[float] = 1.0 + length_penalty: Optional[float] = 1.0 early_stopping: Optional[bool] = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) + ignore_eos: Optional[bool] = False skip_special_tokens: Optional[bool] = True spaces_between_special_tokens: Optional[bool] = True - repetition_penalty: Optional[float] = 1.0 - min_p: Optional[float] = 0.0 - include_stop_str_in_output: Optional[bool] = False - length_penalty: Optional[float] = 1.0 - guided_json: Optional[Union[str, dict, BaseModel]] = None - guided_regex: Optional[str] = None - guided_choice: Optional[List[str]] = None - guided_grammar: Optional[str] = None - response_format: Optional[ResponseFormat] = None + # doc: end-completion-sampling-params + + # doc: begin-completion-extra-params + include_stop_str_in_output: Optional[bool] = Field( + default=False, + description=( + "Whether to include the stop string in the output. " + "This is only applied when the stop or stop_token_ids is set."), + ) + response_format: Optional[ResponseFormat] = Field( + default=None, + description= + ("Similar to chat completion, this parameter specifies the format of " + "output. 
Only {'type': 'json_object'} or {'type': 'text' } is " + "supported."), + ) + guided_json: Optional[Union[str, dict, BaseModel]] = Field( + default=None, + description=("If specified, the output will follow the JSON schema."), + ) + guided_regex: Optional[str] = Field( + default=None, + description=( + "If specified, the output will follow the regex pattern."), + ) + guided_choice: Optional[List[str]] = Field( + default=None, + description=( + "If specified, the output will be exactly one of the choices."), + ) + guided_grammar: Optional[str] = Field( + default=None, + description=( + "If specified, the output will follow the context free grammar."), + ) + + # doc: end-completion-extra-params def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0