From 4c2ffb28ffe7270b49ac7cf5324978950a28e7e1 Mon Sep 17 00:00:00 2001
From: Cade Daniel
Date: Tue, 11 Jun 2024 10:15:40 -0700
Subject: [PATCH] [Speculative decoding] Initial spec decode docs (#5400)

---
 docs/source/index.rst              |  1 +
 docs/source/models/spec_decode.rst | 78 ++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 docs/source/models/spec_decode.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0ff0ea1da1ca4..d1886d5d014b0 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -90,6 +90,7 @@ Documentation
    models/engine_args
    models/lora
    models/vlm
+   models/spec_decode
    models/performance
 
 .. toctree::
diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst
new file mode 100644
index 0000000000000..57ff4517e9b91
--- /dev/null
+++ b/docs/source/models/spec_decode.rst
@@ -0,0 +1,78 @@
+.. _spec_decode:
+
+Speculative decoding in vLLM
+============================
+
+.. warning::
+    Please note that speculative decoding in vLLM is not yet optimized and may not
+    reduce inter-token latency for all prompt datasets or sampling parameters. The work
+    to optimize it is ongoing and can be followed in `this issue. `_
+
+This document shows how to use `Speculative Decoding `_ with vLLM.
+Speculative decoding is a technique that improves inter-token latency in memory-bound LLM inference.
+
+Speculating with a draft model
+------------------------------
+
+The following code configures vLLM to use speculative decoding with a draft model, speculating 5 tokens at a time.
+
+.. code-block:: python
+
+    from vllm import LLM, SamplingParams
+
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    llm = LLM(
+        model="facebook/opt-6.7b",
+        tensor_parallel_size=1,
+        speculative_model="facebook/opt-125m",
+        num_speculative_tokens=5,
+        use_v2_block_manager=True,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+Speculating by matching n-grams in the prompt
+---------------------------------------------
+
+The following code configures vLLM to use speculative decoding where proposals are generated by
+matching n-grams in the prompt. For more information, read `this thread. `_
+
+.. code-block:: python
+
+    from vllm import LLM, SamplingParams
+
+    prompts = [
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    llm = LLM(
+        model="facebook/opt-6.7b",
+        tensor_parallel_size=1,
+        speculative_model="[ngram]",
+        num_speculative_tokens=5,
+        ngram_prompt_lookup_max=4,
+        use_v2_block_manager=True,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+Resources for vLLM contributors
+-------------------------------
+
+* `A Hacker's Guide to Speculative Decoding in vLLM `_
+* `What is Lookahead Scheduling in vLLM? `_
+* `Information on batch expansion. `_
+* `Dynamic speculative decoding `_