From 9c5c81b0daa6fd86e5d7b3f1465a6261e6d3cd54 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sun, 23 Mar 2025 14:00:55 -0700
Subject: [PATCH] [Misc][Doc] Add note regarding loading `generation_config` by default (#15281)

Signed-off-by: Roger Wang
---
 docs/source/getting_started/quickstart.md       | 12 +++++++++++-
 docs/source/models/generative_models.md         |  5 +++++
 docs/source/serving/openai_compatible_server.md |  4 ++++
 vllm/config.py                                  |  7 +++++++
 4 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 452bee2385fe3..b5246c41883ea 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -58,6 +58,11 @@ from vllm import LLM, SamplingParams
 ```
 
 The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
+:::{important}
+By default, vLLM will use the sampling parameters recommended by the model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will give you the best results if {class}`~vllm.SamplingParams` is not specified.
+
+However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
+:::
 
 ```python
 prompts = [
@@ -76,7 +81,7 @@ llm = LLM(model="facebook/opt-125m")
 ```
 
 :::{note}
-By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
+By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
 :::
 
 Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
@@ -107,6 +112,11 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
 By default, the server uses a predefined chat template stored in the tokenizer.
 You can learn about overriding it [here](#chat-template).
 :::
 
+:::{important}
+By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
+
+To disable this behavior, please pass `--generation-config vllm` when launching the server.
+:::
 
 This server can be queried in the same format as OpenAI API. For example, to list the models:
diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md
index 06daa04f2deaa..c94e940b8534c 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/source/models/generative_models.md
@@ -46,6 +46,11 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
+:::{important}
+By default, vLLM will use the sampling parameters recommended by the model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will give you the best results if {class}`~vllm.SamplingParams` is not specified.
+
+However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
+:::
 A code example can be found here:
 
 ### `LLM.beam_search`
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index a6ec05f45b69b..1cebff7e1f6e2 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -33,7 +33,11 @@ print(completion.choices[0].message)
 vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
 You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`.
 :::
 
+:::{important}
+By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
+To disable this behavior, please pass `--generation-config vllm` when launching the server.
+:::
 
 ## Supported APIs
 
 We currently support the following OpenAI APIs:
diff --git a/vllm/config.py b/vllm/config.py
index 1552fb280a268..ea056bcc928b2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1023,6 +1023,13 @@ class ModelConfig:
                     "max_new_tokens")
         else:
             diff_sampling_param = {}
+
+        if diff_sampling_param:
+            logger.warning_once(
+                "Default sampling parameters have been overridden by the "
+                "model's Hugging Face generation config, as recommended by "
+                "the model creator. If this is not intended, please relaunch "
+                "the vLLM instance with `--generation-config vllm`.")
         return diff_sampling_param
 
     @property
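
As a quick illustration of the behavior this patch documents, here is a minimal offline-inference sketch (not part of the patch itself). The model name is only an example, and it assumes a vLLM build that supports the `generation_config` engine argument with the `"vllm"` value described in the notes above.

```python
from vllm import LLM, SamplingParams

prompts = ["The future of AI is"]

# Default behavior: if the Hugging Face repo ships a generation_config.json,
# the model creator's recommended sampling parameters are applied whenever
# no SamplingParams object is supplied.
llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")
outputs = llm.generate(prompts)

# Opt out and keep vLLM's own default sampling parameters instead.
llm_vllm_defaults = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
                        generation_config="vllm")

# As in the quickstart, sampling parameters can also be set explicitly.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm_vllm_defaults.generate(prompts, sampling_params)

for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
```

For the OpenAI-compatible server, the equivalent opt-out is the `--generation-config vllm` flag shown in the diff above.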