diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst
index 87a52360c084..be901fa881b1 100644
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/models/spec_decode.rst
@@ -69,6 +69,55 @@ matching n-grams in the prompt. For more information read `this thread. `_ or
+`this technical report <https://arxiv.org/pdf/2404.19124>`_.
+
+.. code-block:: python
+
+ from vllm import LLM, SamplingParams
+
+ prompts = [
+ "The future of AI is",
+ ]
+ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+ llm = LLM(
+ model="meta-llama/Meta-Llama-3.1-70B-Instruct",
+ tensor_parallel_size=4,
+ speculative_model="ibm-fms/llama3-70b-accelerator",
+ speculative_draft_tensor_parallel_size=1,
+ use_v2_block_manager=True,
+ )
+ outputs = llm.generate(prompts, sampling_params)
+
+ for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+Note that these speculative models currently need to be run without tensor parallelism, although
+it is possible to run the main model using tensor parallelism (see example above). Since the
+speculative models are relatively small, we still see significant speedups. However, this
+limitation will be fixed in a future release.
+
+A variety of speculative models of this type are available on HF hub:
+
+* `llama-13b-accelerator <https://huggingface.co/ibm-fms/llama-13b-accelerator>`_
+* `llama3-8b-accelerator <https://huggingface.co/ibm-fms/llama3-8b-accelerator>`_
+* `codellama-34b-accelerator <https://huggingface.co/ibm-fms/codellama-34b-accelerator>`_
+* `llama2-70b-accelerator <https://huggingface.co/ibm-fms/llama2-70b-accelerator>`_
+* `llama3-70b-accelerator <https://huggingface.co/ibm-fms/llama3-70b-accelerator>`_
+* `granite-3b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator>`_
+* `granite-8b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator>`_
+* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
+* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
+
+
Resources for vLLM contributors
-------------------------------
* `A Hacker's Guide to Speculative Decoding in vLLM `_
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index d3aec06a92fd..95a655fbbf37 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -56,6 +56,15 @@ class MLPSpeculatorLayerNorm(nn.Module):
class MLPSpeculator(nn.Module):
+ """
+ An implementation of the speculative models introduced in
+ "Accelerating Production LLMs with Combined Token/Embedding
+ Speculators"
+ https://arxiv.org/pdf/2404.19124
+
+ Trained speculators of this type are available on HF hub at:
+ https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite
+ """
def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None:
super().__init__()