From 789937af2edb6c1ff847c3cbf0c773fb06602a5f Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Tue, 6 Aug 2024 01:29:43 +0200
Subject: [PATCH] [Doc] [SpecDecode] Update MLPSpeculator documentation (#7100)

Signed-off-by: Thomas Parnell
---
 docs/source/models/spec_decode.rst           | 49 ++++++++++++++++++++
 vllm/model_executor/models/mlp_speculator.py |  9 ++++
 2 files changed, 58 insertions(+)

diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst
index 87a52360c084..be901fa881b1 100644
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/models/spec_decode.rst
@@ -69,6 +69,55 @@ matching n-grams in the prompt. For more information read `this thread. `_
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
+Speculating using MLP speculators
+---------------------------------
+
+The following code configures vLLM to use speculative decoding where proposals are generated by
+draft models that condition draft predictions on both context vectors and sampled tokens.
+For more information see `this blog
+<https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/>`_ or
+`this technical report <https://arxiv.org/pdf/2404.19124>`_.
+
+.. code-block:: python
+
+    from vllm import LLM, SamplingParams
+
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
+        tensor_parallel_size=4,
+        speculative_model="ibm-fms/llama3-70b-accelerator",
+        speculative_draft_tensor_parallel_size=1,
+        use_v2_block_manager=True,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+Note that these speculative models currently need to be run without tensor parallelism, although
+it is possible to run the main model using tensor parallelism (see the example above). Since the
+speculative models are relatively small, we still see significant speedups. However, this
+limitation will be fixed in a future release.
+
+A variety of speculative models of this type are available on the HF hub:
+
+* `llama-13b-accelerator <https://huggingface.co/ibm-fms/llama-13b-accelerator>`_
+* `llama3-8b-accelerator <https://huggingface.co/ibm-fms/llama3-8b-accelerator>`_
+* `codellama-34b-accelerator <https://huggingface.co/ibm-fms/codellama-34b-accelerator>`_
+* `llama2-70b-accelerator <https://huggingface.co/ibm-fms/llama2-70b-accelerator>`_
+* `llama3-70b-accelerator <https://huggingface.co/ibm-fms/llama3-70b-accelerator>`_
+* `granite-3b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator>`_
+* `granite-8b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator>`_
+* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
+* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
+
 Resources for vLLM contributors
 -------------------------------
 * `A Hacker's Guide to Speculative Decoding in vLLM <https://www.youtube.com/watch?v=9wNAgpX6z_4>`_

diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index d3aec06a92fd..95a655fbbf37 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -56,6 +56,15 @@ class MLPSpeculatorLayerNorm(nn.Module):
 
 
 class MLPSpeculator(nn.Module):
+    """
+    An implementation of the speculative models introduced in
+    "Accelerating Production LLMs with Combined Token/Embedding
+    Speculators"
+    https://arxiv.org/pdf/2404.19124
+
+    Trained speculators of this type are available on the HF hub at:
+    https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite
+    """
 
     def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None:
         super().__init__()
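For readers of this patch who want intuition for what the new docstring describes, below is a minimal sketch of a combined token/embedding speculator in the spirit of the technical report. It is an illustration under stated assumptions, not vLLM's actual ``MLPSpeculator`` (which, among other refinements, applies the ``MLPSpeculatorLayerNorm`` defined earlier in the same file); the class name ``TinyMLPSpeculatorSketch``, the dimensions ``emb_dim`` and ``n_predict``, and the greedy drafting loop are all hypothetical. The core idea is that each prediction head conditions on both the base model's last hidden state and the embedding of the most recently drafted token.

.. code-block:: python

    # Illustrative sketch only -- not vLLM's MLPSpeculator implementation.
    # All names and shapes here are assumptions made for clarity.
    import torch
    import torch.nn as nn

    class TinyMLPSpeculatorSketch(nn.Module):
        def __init__(self, vocab_size: int, emb_dim: int, n_predict: int) -> None:
            super().__init__()
            self.n_predict = n_predict  # number of lookahead tokens to draft
            # One embedding table, projection, and LM head per lookahead position.
            self.emb = nn.ModuleList(
                nn.Embedding(vocab_size, emb_dim) for _ in range(n_predict))
            self.proj = nn.ModuleList(
                nn.Linear(2 * emb_dim, emb_dim) for _ in range(n_predict))
            self.head = nn.ModuleList(
                nn.Linear(emb_dim, vocab_size) for _ in range(n_predict))

        def forward(self, hidden: torch.Tensor,
                    last_token: torch.Tensor) -> torch.Tensor:
            # hidden:     [batch, emb_dim] context vector from the base model
            # last_token: [batch] id of the most recently accepted token
            draft = []
            for i in range(self.n_predict):
                # Condition each head on both the running hidden state and
                # the embedding of the token drafted by the previous head.
                x = torch.cat([hidden, self.emb[i](last_token)], dim=-1)
                hidden = torch.relu(self.proj[i](x))
                last_token = self.head[i](hidden).argmax(dim=-1)  # greedy draft
                draft.append(last_token)
            return torch.stack(draft, dim=1)  # [batch, n_predict] draft tokens

    # Example usage with made-up sizes: drafts 3 lookahead tokens per step.
    spec = TinyMLPSpeculatorSketch(vocab_size=32000, emb_dim=4096, n_predict=3)
    tokens = spec(torch.randn(2, 4096), torch.tensor([17, 42]))  # shape [2, 3]

Under these assumptions, the speculator drafts several tokens per base-model step, and the target model then verifies all of them in a single forward pass; because the speculator itself is small, it can run with draft tensor parallel size 1 (as in the doc example above) while still yielding a net speedup.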