Improve configs - SpeculativeConfig (#16971)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

commit d059110498
parent 571e8dd65e

vllm/config.py | 184 changes
@@ -2128,139 +2128,113 @@ class DeviceConfig:
         self.device = torch.device(self.device_type)
 
 
+SpeculativeMethod = Literal["ngram", "eagle", "medusa", "mlp_speculator",
+                            "draft_model"]
+SpeculativeAcceptanceMethod = Literal["rejection_sampler",
+                                      "typical_acceptance_sampler"]
+
+
+@config
 @dataclass
 class SpeculativeConfig:
-    """
-    Configuration for speculative decoding.
-    Configurable parameters include:
-        - General Speculative Decoding Control:
-            - num_speculative_tokens (int): The number of speculative
-                tokens, if provided. It will default to the number in the draft
-                model config if present, otherwise, it is required.
-            - model (Optional[str]): The name of the draft model, eagle head,
-                or additional weights, if provided.
-            - method (Optional[str]): The name of the speculative method to use.
-                If users provide and set the `model` param, the speculative method
-                type will be detected automatically if possible, if `model` param
-                is not provided, the method name must be provided.
-                - Possible values:
-                    - ngram
-                        Related additional configuration:
-                        - prompt_lookup_max (Optional[int]):
-                            Maximum size of ngram token window when using Ngram
-                            proposer, required when method is set to ngram.
-                        - prompt_lookup_min (Optional[int]):
-                            Minimum size of ngram token window when using Ngram
-                            proposer, if provided. Defaults to 1.
-                    - eagle
-                    - medusa
-                    - mlp_speculator
-                    - draft_model
-            - acceptance_method (str): The method to use for accepting draft
-                tokens. This can take two possible values: 'rejection_sampler' and
-                'typical_acceptance_sampler' for RejectionSampler and
-                TypicalAcceptanceSampler respectively. If not specified, it
-                defaults to 'rejection_sampler'.
-                - Possible values:
-                    - rejection_sampler
-                    - typical_acceptance_sampler
-                        Related additional configuration:
-                        - posterior_threshold (Optional[float]):
-                            A threshold value that sets a lower bound on the
-                            posterior probability of a token in the target model
-                            for it to be accepted. This threshold is used only
-                            when we use the TypicalAcceptanceSampler for token
-                            acceptance.
-                        - posterior_alpha (Optional[float]):
-                            Scaling factor for entropy-based threshold, applied
-                            when using TypicalAcceptanceSampler.
-            - draft_tensor_parallel_size (Optional[int]): The degree of the tensor
-                parallelism for the draft model. Can only be 1 or the same as the
-                target model's tensor parallel size.
-            - disable_logprobs (bool): If set to True, token log probabilities are
-                not returned during speculative decoding. If set to False, token
-                log probabilities are returned according to the log probability
-                settings in SamplingParams. If not specified, it defaults to True.
-
-        - Draft Model Configuration:
-            - quantization (Optional[str]): Quantization method that was used to
-                quantize the draft model weights. If None, we assume the
-                model weights are not quantized. Note that it only takes effect
-                when using the draft model-based speculative method.
-            - max_model_len (Optional[int]): The maximum model length of the
-                draft model. Used when testing the ability to skip
-                speculation for some sequences.
-            - revision: The specific model version to use for the draft model. It
-                can be a branch name, a tag name, or a commit id. If unspecified,
-                will use the default version.
-            - code_revision: The specific revision to use for the draft model code
-                on Hugging Face Hub. It can be a branch name, a tag name, or a
-                commit id. If unspecified, will use the default version.
-
-        - Advanced Control:
-            - disable_mqa_scorer (bool): Disable the MQA scorer and fall back to
-                batch expansion for scoring proposals. If not specified, it
-                defaults to False.
-            - disable_by_batch_size (Optional[int]): Disable speculative decoding
-                for new incoming requests when the number of enqueued requests is
-                larger than this value, if provided.
-
-    Although the parameters above are structured hierarchically, there is no
-    need to nest them during configuration.
-
-    Non-configurable internal parameters include:
-        - Model Configuration:
-            - target_model_config (ModelConfig): The configuration of the target
-                model.
-            - draft_model_config (ModelConfig): The configuration of the draft
-                model initialized internal.
-        - Parallelism Configuration:
-            - target_parallel_config (ParallelConfig): The parallel configuration
-                for the target model.
-            - draft_parallel_config (ParallelConfig): The parallel configuration
-                for the draft model initialized internal.
-        - Execution Control:
-            - enable_chunked_prefill (bool): Whether vLLM is configured to use
-                chunked prefill or not. Used for raising an error since it's not
-                yet compatible with speculative decode.
-            - disable_log_stats (bool): Whether to disable the periodic printing of
-                stage times in speculative decoding.
-    """
+    """Configuration for speculative decoding."""
 
-    # speculative configs from cli args
+    # General speculative decoding control
     num_speculative_tokens: int = field(default=None,
                                         init=True)  # type: ignore
-    method: Optional[str] = None
-    acceptance_method: str = "rejection_sampler"
-    draft_tensor_parallel_size: Optional[int] = None
-    disable_logprobs: bool = True
+    """The number of speculative tokens, if provided. It will default to the
+    number in the draft model config if present, otherwise, it is required."""
     model: Optional[str] = None
-    quantization: Optional[str] = None
-    max_model_len: Optional[int] = None
-    revision: Optional[str] = None
-    code_revision: Optional[str] = None
+    """The name of the draft model, eagle head, or additional weights, if
+    provided."""
+    method: Optional[SpeculativeMethod] = None
+    """The name of the speculative method to use. If users provide and set the
+    `model` param, the speculative method type will be detected automatically
+    if possible, if `model` param is not provided, the method name must be
+    provided.
+
+    If using `ngram` method, the related configuration `prompt_lookup_max` and
+    `prompt_lookup_min` should be considered."""
+    acceptance_method: SpeculativeAcceptanceMethod = "rejection_sampler"
+    """The method to use for accepting draft tokens:\n
+    - "rejection_sampler" maps to `RejectionSampler`.\n
+    - "typical_acceptance_sampler" maps to `TypicalAcceptanceSampler`.
+
+    If using `typical_acceptance_sampler`, the related configuration
+    `posterior_threshold` and `posterior_alpha` should be considered."""
+    draft_tensor_parallel_size: Optional[int] = None
+    """The degree of the tensor parallelism for the draft model. Can only be 1
+    or the same as the target model's tensor parallel size."""
+    disable_logprobs: bool = True
+    """If set to True, token log probabilities are not returned during
+    speculative decoding. If set to False, token log probabilities are returned
+    according to the log probability settings in SamplingParams."""
+
+    # Draft model configuration
+    quantization: Optional[str] = None
+    """Quantization method that was used to quantize the draft model weights.
+    If `None`, we assume the model weights are not quantized. Note that it only
+    takes effect when using the draft model-based speculative method."""
+    max_model_len: Optional[int] = None
+    """The maximum model length of the draft model. Used when testing the
+    ability to skip speculation for some sequences."""
+    revision: Optional[str] = None
+    """The specific model version to use for the draft model. It can be a
+    branch name, a tag name, or a commit id. If unspecified, will use the
+    default version."""
+    code_revision: Optional[str] = None
+    """The specific revision to use for the draft model code on Hugging Face
+    Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
+    will use the default version."""
+
+    # Advanced control
     disable_mqa_scorer: bool = False
+    """Disable the MQA scorer and fall back to batch expansion for scoring
+    proposals."""
     disable_by_batch_size: Optional[int] = None
+    """Disable speculative decoding for new incoming requests when the number
+    of enqueued requests is larger than this value, if provided."""
+
+    # Ngram proposer configuration
     prompt_lookup_max: Optional[int] = None
+    """Maximum size of ngram token window when using Ngram proposer, required
+    when method is set to ngram."""
     prompt_lookup_min: Optional[int] = None
+    """Minimum size of ngram token window when using Ngram proposer, if
+    provided. Defaults to 1."""
+
+    # Typical acceptance sampler configuration
     posterior_threshold: Optional[float] = None
+    """A threshold value that sets a lower bound on the posterior probability
+    of a token in the target model for it to be accepted. This threshold is
+    used only when we use the `TypicalAcceptanceSampler` for token acceptance.
+    """
     posterior_alpha: Optional[float] = None
+    """Scaling factor for entropy-based threshold, applied when using
+    `TypicalAcceptanceSampler`."""
 
     # required configuration params passed from engine
     target_model_config: ModelConfig = field(default=None,
                                              init=True)  # type: ignore
+    """The configuration of the target model."""
     target_parallel_config: ParallelConfig = field(default=None,
                                                    init=True)  # type: ignore
+    """The parallel configuration for the target model."""
     enable_chunked_prefill: bool = field(default=None,
                                          init=True)  # type: ignore
+    """Whether vLLM is configured to use chunked prefill or not. Used for
+    raising an error since it's not yet compatible with speculative decode."""
     disable_log_stats: bool = field(default=None, init=True)  # type: ignore
+    """Whether to disable the periodic printing of stage times in speculative
+    decoding."""
 
     # params generated in the post-init stage
     draft_model_config: ModelConfig = field(default=None,
                                             init=True)  # type: ignore
+    """The configuration of the draft model initialized internal."""
     draft_parallel_config: ParallelConfig = field(default=None,
                                                   init=True)  # type: ignore
+    """The parallel configuration for the draft model initialized internal."""
 
     def compute_hash(self) -> str:
         """
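Note: the regrouped fields above are exactly the keys a user passes as speculative configuration. The following is a minimal sketch, not taken from this commit: it assumes the `LLM` entrypoint forwards a `speculative_config` dict through `EngineArgs` (the `--speculative-config` flag below accepts the same shape), and the model names are placeholders.

    # Hedged sketch: field names come from SpeculativeConfig above; the target
    # model name is a placeholder and the keyword pass-through is assumed.
    from vllm import LLM

    llm = LLM(
        model="<target-model>",            # placeholder
        speculative_config={
            "method": "ngram",             # one of the SpeculativeMethod literals
            "num_speculative_tokens": 5,   # required if no draft model supplies it
            "prompt_lookup_max": 4,        # required by the ngram proposer
            "prompt_lookup_min": 1,        # optional, defaults to 1
        },
    )

    # With a draft model instead of ngram, the acceptance sampler can be tuned
    # (threshold/alpha values here are purely illustrative):
    #   {"model": "<draft-model>", "method": "draft_model",
    #    "acceptance_method": "typical_acceptance_sampler",
    #    "posterior_threshold": 0.09, "posterior_alpha": 0.3}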
@@ -768,11 +768,18 @@ class EngineArgs:
                             help=('Maximum number of forward steps per '
                                   'scheduler call.'))
 
-        parser.add_argument('--speculative-config',
-                            type=json.loads,
-                            default=None,
-                            help='The configurations for speculative decoding.'
-                            ' Should be a JSON string.')
+        # Speculative arguments
+        speculative_group = parser.add_argument_group(
+            title="SpeculativeConfig",
+            description=SpeculativeConfig.__doc__,
+        )
+        speculative_group.add_argument(
+            '--speculative-config',
+            type=json.loads,
+            default=None,
+            help='The configurations for speculative decoding.'
+            ' Should be a JSON string.')
 
         parser.add_argument(
             '--ignore-patterns',
            action="append",
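Note: since the option keeps `type=json.loads`, moving it into an argument group changes only where it appears in `--help`, not how the value is parsed. A self-contained sketch with standard-library argparse, mirroring the calls added in this hunk:

    import argparse
    import json

    parser = argparse.ArgumentParser()
    # Same grouping as above, minus the SpeculativeConfig.__doc__ description.
    group = parser.add_argument_group(title="SpeculativeConfig")
    group.add_argument('--speculative-config',
                       type=json.loads,   # JSON string -> dict
                       default=None)

    args = parser.parse_args([
        '--speculative-config',
        '{"method": "ngram", "num_speculative_tokens": 5, "prompt_lookup_max": 4}',
    ])
    assert args.speculative_config["method"] == "ngram"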