From 789937af2edb6c1ff847c3cbf0c773fb06602a5f Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Tue, 6 Aug 2024 01:29:43 +0200
Subject: [PATCH] [Doc] [SpecDecode] Update MLPSpeculator documentation (#7100)

Signed-off-by: Thomas Parnell
---
 docs/source/models/spec_decode.rst           | 49 ++++++++++++++++++++
 vllm/model_executor/models/mlp_speculator.py |  9 ++++
 2 files changed, 58 insertions(+)

diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst
index 87a52360c084..be901fa881b1 100644
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/models/spec_decode.rst
@@ -69,6 +69,55 @@ matching n-grams in the prompt. For more information read `this thread. `_
         generated_text = output.outputs[0].text
         print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
+Speculating using MLP speculators
+---------------------------------
+
+The following code configures vLLM to use speculative decoding where proposals are generated by
+draft models that condition draft predictions on both context vectors and sampled tokens.
+For more information see `this blog
+<https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/>`_ or
+`this technical report <https://arxiv.org/pdf/2404.19124>`_.
+
+.. code-block:: python
+
+    from vllm import LLM, SamplingParams
+
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
+        tensor_parallel_size=4,
+        speculative_model="ibm-fms/llama3-70b-accelerator",
+        speculative_draft_tensor_parallel_size=1,
+        use_v2_block_manager=True,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+Note that these speculative models currently need to be run without tensor parallelism, although
+it is possible to run the main model using tensor parallelism (see the example above). Since the
+speculative models are relatively small, we still see significant speedups. However, this
+limitation will be fixed in a future release.
+
+A variety of speculative models of this type are available on the HF hub:
+
+* `llama-13b-accelerator <https://huggingface.co/ibm-fms/llama-13b-accelerator>`_
+* `llama3-8b-accelerator <https://huggingface.co/ibm-fms/llama3-8b-accelerator>`_
+* `codellama-34b-accelerator <https://huggingface.co/ibm-fms/codellama-34b-accelerator>`_
+* `llama2-70b-accelerator <https://huggingface.co/ibm-fms/llama2-70b-accelerator>`_
+* `llama3-70b-accelerator <https://huggingface.co/ibm-fms/llama3-70b-accelerator>`_
+* `granite-3b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator>`_
+* `granite-8b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator>`_
+* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
+* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
+
 Resources for vLLM contributors
 -------------------------------
 * `A Hacker's Guide to Speculative Decoding in vLLM <https://www.youtube.com/watch?v=9wNAgpX6z_4>`_

diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index d3aec06a92fd..95a655fbbf37 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -56,6 +56,15 @@ class MLPSpeculatorLayerNorm(nn.Module):
 
 
 class MLPSpeculator(nn.Module):
+    """
+    An implementation of the speculative models introduced in
+    "Accelerating Production LLMs with Combined Token/Embedding
+    Speculators"
+    https://arxiv.org/pdf/2404.19124
+
+    Trained speculators of this type are available on the HF hub at:
+    https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite
+    """
 
     def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None:
         super().__init__()
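For readers of this patch who want intuition for what the new docstring describes, below is a minimal sketch of a combined token/embedding speculator in the spirit of the technical report. It is an illustration under stated assumptions, not vLLM's actual ``MLPSpeculator`` (which, among other refinements, applies the ``MLPSpeculatorLayerNorm`` defined earlier in the same file); the class name ``TinyMLPSpeculatorSketch``, the dimensions ``emb_dim`` and ``n_predict``, and the greedy drafting loop are all hypothetical. The core idea is that each prediction head conditions on both the base model's last hidden state and the embedding of the most recently drafted token.

.. code-block:: python

    # Illustrative sketch only -- not vLLM's MLPSpeculator implementation.
    # All names and shapes here are assumptions made for clarity.
    import torch
    import torch.nn as nn

    class TinyMLPSpeculatorSketch(nn.Module):
        def __init__(self, vocab_size: int, emb_dim: int, n_predict: int) -> None:
            super().__init__()
            self.n_predict = n_predict  # number of lookahead tokens to draft
            # One embedding table, projection, and LM head per lookahead position.
            self.emb = nn.ModuleList(
                nn.Embedding(vocab_size, emb_dim) for _ in range(n_predict))
            self.proj = nn.ModuleList(
                nn.Linear(2 * emb_dim, emb_dim) for _ in range(n_predict))
            self.head = nn.ModuleList(
                nn.Linear(emb_dim, vocab_size) for _ in range(n_predict))

        def forward(self, hidden: torch.Tensor,
                    last_token: torch.Tensor) -> torch.Tensor:
            # hidden:     [batch, emb_dim] context vector from the base model
            # last_token: [batch] id of the most recently accepted token
            draft = []
            for i in range(self.n_predict):
                # Condition each head on both the running hidden state and
                # the embedding of the token drafted by the previous head.
                x = torch.cat([hidden, self.emb[i](last_token)], dim=-1)
                hidden = torch.relu(self.proj[i](x))
                last_token = self.head[i](hidden).argmax(dim=-1)  # greedy draft
                draft.append(last_token)
            return torch.stack(draft, dim=1)  # [batch, n_predict] draft tokens

    # Example usage with made-up sizes: drafts 3 lookahead tokens per step.
    spec = TinyMLPSpeculatorSketch(vocab_size=32000, emb_dim=4096, n_predict=3)
    tokens = spec(torch.randn(2, 4096), torch.tensor([17, 42]))  # shape [2, 3]

Under these assumptions, the speculator drafts several tokens per base-model step, and the target model then verifies all of them in a single forward pass; because the speculator itself is small, it can run with draft tensor parallel size 1 (as in the doc example above) while still yielding a net speedup.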