mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-13 23:55:44 +08:00
[Doc] add online speculative decoding example (#7243)
This commit is contained in:
parent
80cbe10c59
commit
0e12cd67a8
@ -14,7 +14,7 @@ Speculative decoding is a technique which improves inter-token latency in memory
|
|||||||
Speculating with a draft model
|
Speculating with a draft model
|
||||||
------------------------------
|
------------------------------
|
||||||
|
|
||||||
The following code configures vLLM to use speculative decoding with a draft model, speculating 5 tokens at a time.
|
The following code configures vLLM in offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
|
||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
@ -39,6 +39,50 @@ The following code configures vLLM to use speculative decoding with a draft mode
|
|||||||
generated_text = output.outputs[0].text
|
generated_text = output.outputs[0].text
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
|
||||||
|
To perform the same in online mode, launch the server:
|
||||||
|
|
||||||
|
.. code-block:: bash
|
||||||
|
|
||||||
|
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
|
||||||
|
--seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \
|
||||||
|
--num_speculative_tokens 5 --gpu_memory_utilization 0.8
|
||||||
|
|
||||||
|
Then use a client:
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||||
|
openai_api_key = "EMPTY"
|
||||||
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
|
|
||||||
|
client = OpenAI(
|
||||||
|
# defaults to os.environ.get("OPENAI_API_KEY")
|
||||||
|
api_key=openai_api_key,
|
||||||
|
base_url=openai_api_base,
|
||||||
|
)
|
||||||
|
|
||||||
|
models = client.models.list()
|
||||||
|
model = models.data[0].id
|
||||||
|
|
||||||
|
# Completion API
|
||||||
|
stream = False
|
||||||
|
completion = client.completions.create(
|
||||||
|
model=model,
|
||||||
|
prompt="The future of AI is",
|
||||||
|
echo=False,
|
||||||
|
n=1,
|
||||||
|
stream=stream,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Completion results:")
|
||||||
|
if stream:
|
||||||
|
for c in completion:
|
||||||
|
print(c)
|
||||||
|
else:
|
||||||
|
print(completion)
|
||||||
|
|
||||||
Speculating by matching n-grams in the prompt
|
Speculating by matching n-grams in the prompt
|
||||||
---------------------------------------------
|
---------------------------------------------
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user