mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 16:24:56 +08:00
[Doc] [SpecDecode] Update MLPSpeculator documentation (#7100)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
This commit is contained in:
parent
dfb1a15dcb
commit
789937af2e
@ -69,6 +69,55 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
|
|||||||
generated_text = output.outputs[0].text
|
generated_text = output.outputs[0].text
|
||||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
|
||||||
|
Speculating using MLP speculators
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
The following code configures vLLM to use speculative decoding where proposals are generated by
|
||||||
|
draft models that condition draft predictions on both context vectors and sampled tokens.
|
||||||
|
For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/>`_ or
|
||||||
|
`this technical report <https://arxiv.org/abs/2404.19124>`_.
|
||||||
|
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
prompts = [
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
|
|
||||||
|
llm = LLM(
|
||||||
|
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
|
||||||
|
tensor_parallel_size=4,
|
||||||
|
speculative_model="ibm-fms/llama3-70b-accelerator",
|
||||||
|
speculative_draft_tensor_parallel_size=1,
|
||||||
|
use_v2_block_manager=True,
|
||||||
|
)
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
|
||||||
|
Note that these speculative models currently need to be run without tensor parallelism, although
|
||||||
|
it is possible to run the main model using tensor parallelism (see example above). Since the
|
||||||
|
speculative models are relatively small, we still see significant speedups. However, this
|
||||||
|
limitation will be fixed in a future release.
|
||||||
|
|
||||||
|
A variety of speculative models of this type are available on HF hub:
|
||||||
|
|
||||||
|
* `llama-13b-accelerator <https://huggingface.co/ibm-fms/llama-13b-accelerator>`_
|
||||||
|
* `llama3-8b-accelerator <https://huggingface.co/ibm-fms/llama3-8b-accelerator>`_
|
||||||
|
* `codellama-34b-accelerator <https://huggingface.co/ibm-fms/codellama-34b-accelerator>`_
|
||||||
|
* `llama2-70b-accelerator <https://huggingface.co/ibm-fms/llama2-70b-accelerator>`_
|
||||||
|
* `llama3-70b-accelerator <https://huggingface.co/ibm-fms/llama3-70b-accelerator>`_
|
||||||
|
* `granite-3b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator>`_
|
||||||
|
* `granite-8b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator>`_
|
||||||
|
* `granite-7b-instruct-accelerator <https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator>`_
|
||||||
|
* `granite-20b-code-instruct-accelerator <https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator>`_
|
||||||
|
|
||||||
|
|
||||||
Resources for vLLM contributors
|
Resources for vLLM contributors
|
||||||
-------------------------------
|
-------------------------------
|
||||||
* `A Hacker's Guide to Speculative Decoding in vLLM <https://www.youtube.com/watch?v=9wNAgpX6z_4>`_
|
* `A Hacker's Guide to Speculative Decoding in vLLM <https://www.youtube.com/watch?v=9wNAgpX6z_4>`_
|
||||||
|
|||||||
@ -56,6 +56,15 @@ class MLPSpeculatorLayerNorm(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class MLPSpeculator(nn.Module):
|
class MLPSpeculator(nn.Module):
|
||||||
|
"""
|
||||||
|
An implementation of the speculative models introduced in
|
||||||
|
"Accelerating Production LLMs with Combined Token/Embedding
|
||||||
|
Speculators"
|
||||||
|
https://arxiv.org/pdf/2404.19124
|
||||||
|
|
||||||
|
Trained speculators of this type are available on HF hub at:
|
||||||
|
https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None:
|
def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user