mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-12 18:55:36 +08:00
Improve configs - SpeculativeConfig (#16971)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent 571e8dd65e
commit d059110498
vllm/config.py (184 changed lines)
@@ -2128,139 +2128,113 @@ class DeviceConfig:
         self.device = torch.device(self.device_type)
 
 
+SpeculativeMethod = Literal["ngram", "eagle", "medusa", "mlp_speculator",
+                            "draft_model"]
+SpeculativeAcceptanceMethod = Literal["rejection_sampler",
+                                      "typical_acceptance_sampler"]
+
+
+@config
 @dataclass
 class SpeculativeConfig:
-    """
-    Configuration for speculative decoding.
-    Configurable parameters include:
-    - General Speculative Decoding Control:
-        - num_speculative_tokens (int): The number of speculative
-            tokens, if provided. It will default to the number in the draft
-            model config if present, otherwise, it is required.
-        - model (Optional[str]): The name of the draft model, eagle head,
-            or additional weights, if provided.
-        - method (Optional[str]): The name of the speculative method to use.
-            If users provide and set the `model` param, the speculative method
-            type will be detected automatically if possible, if `model` param
-            is not provided, the method name must be provided.
-            - Possible values:
-                - ngram
-                    Related additional configuration:
-                    - prompt_lookup_max (Optional[int]):
-                        Maximum size of ngram token window when using Ngram
-                        proposer, required when method is set to ngram.
-                    - prompt_lookup_min (Optional[int]):
-                        Minimum size of ngram token window when using Ngram
-                        proposer, if provided. Defaults to 1.
-                - eagle
-                - medusa
-                - mlp_speculator
-                - draft_model
-        - acceptance_method (str): The method to use for accepting draft
-            tokens. This can take two possible values: 'rejection_sampler' and
-            'typical_acceptance_sampler' for RejectionSampler and
-            TypicalAcceptanceSampler respectively. If not specified, it
-            defaults to 'rejection_sampler'.
-            - Possible values:
-                - rejection_sampler
-                - typical_acceptance_sampler
-                    Related additional configuration:
-                    - posterior_threshold (Optional[float]):
-                        A threshold value that sets a lower bound on the
-                        posterior probability of a token in the target model
-                        for it to be accepted. This threshold is used only
-                        when we use the TypicalAcceptanceSampler for token
-                        acceptance.
-                    - posterior_alpha (Optional[float]):
-                        Scaling factor for entropy-based threshold, applied
-                        when using TypicalAcceptanceSampler.
-        - draft_tensor_parallel_size (Optional[int]): The degree of the tensor
-            parallelism for the draft model. Can only be 1 or the same as the
-            target model's tensor parallel size.
-        - disable_logprobs (bool): If set to True, token log probabilities are
-            not returned during speculative decoding. If set to False, token
-            log probabilities are returned according to the log probability
-            settings in SamplingParams. If not specified, it defaults to True.
-
-    - Draft Model Configuration:
-        - quantization (Optional[str]): Quantization method that was used to
-            quantize the draft model weights. If None, we assume the
-            model weights are not quantized. Note that it only takes effect
-            when using the draft model-based speculative method.
-        - max_model_len (Optional[int]): The maximum model length of the
-            draft model. Used when testing the ability to skip
-            speculation for some sequences.
-        - revision: The specific model version to use for the draft model. It
-            can be a branch name, a tag name, or a commit id. If unspecified,
-            will use the default version.
-        - code_revision: The specific revision to use for the draft model code
-            on Hugging Face Hub. It can be a branch name, a tag name, or a
-            commit id. If unspecified, will use the default version.
-
-    - Advanced Control:
-        - disable_mqa_scorer (bool): Disable the MQA scorer and fall back to
-            batch expansion for scoring proposals. If not specified, it
-            defaults to False.
-        - disable_by_batch_size (Optional[int]): Disable speculative decoding
-            for new incoming requests when the number of enqueued requests is
-            larger than this value, if provided.
-
-    Although the parameters above are structured hierarchically, there is no
-    need to nest them during configuration.
-
-    Non-configurable internal parameters include:
-    - Model Configuration:
-        - target_model_config (ModelConfig): The configuration of the target
-            model.
-        - draft_model_config (ModelConfig): The configuration of the draft
-            model initialized internal.
-    - Parallelism Configuration:
-        - target_parallel_config (ParallelConfig): The parallel configuration
-            for the target model.
-        - draft_parallel_config (ParallelConfig): The parallel configuration
-            for the draft model initialized internal.
-    - Execution Control:
-        - enable_chunked_prefill (bool): Whether vLLM is configured to use
-            chunked prefill or not. Used for raising an error since it's not
-            yet compatible with speculative decode.
-        - disable_log_stats (bool): Whether to disable the periodic printing of
-            stage times in speculative decoding.
-    """
+    """Configuration for speculative decoding."""
 
-    # speculative configs from cli args
+    # General speculative decoding control
     num_speculative_tokens: int = field(default=None,
                                         init=True)  # type: ignore
-    method: Optional[str] = None
-    acceptance_method: str = "rejection_sampler"
-    draft_tensor_parallel_size: Optional[int] = None
-    disable_logprobs: bool = True
-
+    """The number of speculative tokens, if provided. It will default to the
+    number in the draft model config if present; otherwise, it is required."""
     model: Optional[str] = None
-    quantization: Optional[str] = None
-    max_model_len: Optional[int] = None
-    revision: Optional[str] = None
-    code_revision: Optional[str] = None
-
+    """The name of the draft model, eagle head, or additional weights, if
+    provided."""
+    method: Optional[SpeculativeMethod] = None
+    """The name of the speculative method to use. If users provide and set the
+    `model` param, the speculative method type will be detected automatically
+    if possible. If the `model` param is not provided, the method name must be
+    provided.
+
+    If using the `ngram` method, the related configuration `prompt_lookup_max`
+    and `prompt_lookup_min` should be considered."""
+    acceptance_method: SpeculativeAcceptanceMethod = "rejection_sampler"
+    """The method to use for accepting draft tokens:\n
+    - "rejection_sampler" maps to `RejectionSampler`.\n
+    - "typical_acceptance_sampler" maps to `TypicalAcceptanceSampler`.
+
+    If using `typical_acceptance_sampler`, the related configuration
+    `posterior_threshold` and `posterior_alpha` should be considered."""
+    draft_tensor_parallel_size: Optional[int] = None
+    """The degree of tensor parallelism for the draft model. Can only be 1 or
+    the same as the target model's tensor parallel size."""
+    disable_logprobs: bool = True
+    """If set to True, token log probabilities are not returned during
+    speculative decoding. If set to False, token log probabilities are
+    returned according to the log probability settings in SamplingParams."""
+
+    # Draft model configuration
+    quantization: Optional[str] = None
+    """Quantization method that was used to quantize the draft model weights.
+    If `None`, we assume the model weights are not quantized. Note that it
+    only takes effect when using the draft model-based speculative method."""
+    max_model_len: Optional[int] = None
+    """The maximum model length of the draft model. Used when testing the
+    ability to skip speculation for some sequences."""
+    revision: Optional[str] = None
+    """The specific model version to use for the draft model. It can be a
+    branch name, a tag name, or a commit id. If unspecified, will use the
+    default version."""
+    code_revision: Optional[str] = None
+    """The specific revision to use for the draft model code on Hugging Face
+    Hub. It can be a branch name, a tag name, or a commit id. If unspecified,
+    will use the default version."""
+
+    # Advanced control
     disable_mqa_scorer: bool = False
+    """Disable the MQA scorer and fall back to batch expansion for scoring
+    proposals."""
     disable_by_batch_size: Optional[int] = None
+    """Disable speculative decoding for new incoming requests when the number
+    of enqueued requests is larger than this value, if provided."""
 
+    # Ngram proposer configuration
     prompt_lookup_max: Optional[int] = None
+    """Maximum size of the ngram token window when using the Ngram proposer;
+    required when method is set to ngram."""
     prompt_lookup_min: Optional[int] = None
+    """Minimum size of the ngram token window when using the Ngram proposer,
+    if provided. Defaults to 1."""
 
+    # Typical acceptance sampler configuration
     posterior_threshold: Optional[float] = None
+    """A threshold value that sets a lower bound on the posterior probability
+    of a token in the target model for it to be accepted. This threshold is
+    used only when we use the `TypicalAcceptanceSampler` for token acceptance.
+    """
     posterior_alpha: Optional[float] = None
+    """Scaling factor for the entropy-based threshold, applied when using
+    `TypicalAcceptanceSampler`."""
 
     # required configuration params passed from engine
     target_model_config: ModelConfig = field(default=None,
                                              init=True)  # type: ignore
+    """The configuration of the target model."""
     target_parallel_config: ParallelConfig = field(default=None,
                                                    init=True)  # type: ignore
+    """The parallel configuration for the target model."""
     enable_chunked_prefill: bool = field(default=None,
                                          init=True)  # type: ignore
+    """Whether vLLM is configured to use chunked prefill or not. Used for
+    raising an error since it's not yet compatible with speculative decoding."""
     disable_log_stats: bool = field(default=None, init=True)  # type: ignore
+    """Whether to disable the periodic printing of stage times in speculative
+    decoding."""
 
     # params generated in the post-init stage
     draft_model_config: ModelConfig = field(default=None,
                                             init=True)  # type: ignore
+    """The configuration of the draft model, initialized internally."""
     draft_parallel_config: ParallelConfig = field(default=None,
                                                   init=True)  # type: ignore
+    """The parallel configuration for the draft model, initialized
+    internally."""
 
     def compute_hash(self) -> str:
         """
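To see the flat keys in action, here is a minimal sketch of an ngram setup. It assumes the SpeculativeConfig fields above can be passed as a plain dict through vLLM's `LLM` entrypoint; that wiring and the model name are illustrative assumptions, not part of this diff.

# A minimal sketch, assuming the engine accepts SpeculativeConfig fields as a
# flat dict (no nesting needed, per the docstring). The LLM(...) plumbing and
# the model name are assumptions for illustration.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical target model
    speculative_config={
        "method": "ngram",            # one of the SpeculativeMethod literals
        "num_speculative_tokens": 5,  # how many draft tokens to propose
        "prompt_lookup_max": 4,       # required when method is "ngram"
        "prompt_lookup_min": 1,       # optional; defaults to 1
    },
)

As the removed docstring noted and the new field layout preserves, the hierarchical grouping is documentation only; the configuration itself stays flat.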
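A second hedged sketch covers the acceptance-method knobs: a draft-model setup that opts into the typical acceptance sampler. Field names follow the dataclass above; the draft model name and numeric values are placeholders chosen for illustration, not defaults taken from this diff.

# A sketch of draft-model speculation with the typical acceptance sampler.
# The model name and all values below are illustrative assumptions.
speculative_config = {
    "method": "draft_model",
    "model": "my-org/tiny-draft-model",  # hypothetical draft model
    "num_speculative_tokens": 3,
    "draft_tensor_parallel_size": 1,     # must be 1 or the target's TP size
    "acceptance_method": "typical_acceptance_sampler",
    "posterior_threshold": 0.09,         # lower bound on target posterior prob
    "posterior_alpha": 0.3,              # scales the entropy-based threshold
}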
@@ -768,11 +768,18 @@ class EngineArgs:
             help=('Maximum number of forward steps per '
                   'scheduler call.'))
 
-        parser.add_argument('--speculative-config',
-                            type=json.loads,
-                            default=None,
-                            help='The configurations for speculative decoding.'
-                            ' Should be a JSON string.')
+        # Speculative arguments
+        speculative_group = parser.add_argument_group(
+            title="SpeculativeConfig",
+            description=SpeculativeConfig.__doc__,
+        )
+        speculative_group.add_argument(
+            '--speculative-config',
+            type=json.loads,
+            default=None,
+            help='The configurations for speculative decoding.'
+            ' Should be a JSON string.')
 
         parser.add_argument(
             '--ignore-patterns',
             action="append",
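Because the flag is declared with `type=json.loads`, the CLI accepts a single JSON string and hands the parsed dict over as SpeculativeConfig fields. A small sketch of that round trip, with illustrative values:

import json

# What argparse does with --speculative-config: parse one JSON string into a
# plain dict whose keys are SpeculativeConfig field names.
raw = '{"method": "ngram", "num_speculative_tokens": 5, "prompt_lookup_max": 4}'
config = json.loads(raw)
assert config["method"] == "ngram"
assert config["num_speculative_tokens"] == 5

Registering the flag in an argument group titled "SpeculativeConfig" with description=SpeculativeConfig.__doc__ also means the class docstring now renders as the section heading in --help output.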