diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst
index be901fa881b1..d3c196faff25 100644
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/models/spec_decode.rst
@@ -14,17 +14,17 @@ Speculative decoding is a technique which improves inter-token latency in memory
Speculating with a draft model
------------------------------
-The following code configures vLLM to use speculative decoding with a draft model, speculating 5 tokens at a time.
+The following code configures vLLM in offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.
.. code-block:: python
from vllm import LLM, SamplingParams
-
+
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
+
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
@@ -33,12 +33,56 @@ The following code configures vLLM to use speculative decoding with a draft mode
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
-
+
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+To perform the same in online mode, launch the server:
+
+.. code-block:: bash
+
+    python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
+        --seed 42 -tp 1 --speculative-model facebook/opt-125m --use-v2-block-manager \
+        --num-speculative-tokens 5 --gpu-memory-utilization 0.8
+
+Then use a client:
+
+.. code-block:: python
+
+ from openai import OpenAI
+
+ # Modify OpenAI's API key and API base to use vLLM's API server.
+ openai_api_key = "EMPTY"
+ openai_api_base = "http://localhost:8000/v1"
+
+ client = OpenAI(
+ # defaults to os.environ.get("OPENAI_API_KEY")
+ api_key=openai_api_key,
+ base_url=openai_api_base,
+ )
+
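+    # Query the server for the served models and use the first one's id.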
+ models = client.models.list()
+ model = models.data[0].id
+
+ # Completion API
+ stream = False
+ completion = client.completions.create(
+ model=model,
+ prompt="The future of AI is",
+ echo=False,
+ n=1,
+ stream=stream,
+ )
+
+ print("Completion results:")
+ if stream:
+ for c in completion:
+ print(c)
+ else:
+ print(completion)
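+
+The same request can also be issued without the Python client, for example with
+``curl`` (a minimal sketch; the JSON payload mirrors the request above, and the
+``max_tokens`` value is illustrative):
+
+.. code-block:: bash
+
+    curl http://localhost:8000/v1/completions \
+        -H "Content-Type: application/json" \
+        -d '{
+              "model": "facebook/opt-6.7b",
+              "prompt": "The future of AI is",
+              "max_tokens": 64
+            }'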
+
Speculating by matching n-grams in the prompt
---------------------------------------------
@@ -48,12 +92,12 @@ matching n-grams in the prompt. For more information read `this thread. `_ or
+For more information see `this blog `_ or
`this technical report `_.
.. code-block:: python
@@ -100,9 +144,9 @@ For more information see `this blog