Update default neuron config for speculation (#18274)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: Shashwat Srijan <sssrijan@amazon.com>
Co-authored-by: Aakash Shetty <sheaak@amazon.com>

parent e2d7d31244
commit ebed81fbf5
@@ -502,7 +502,7 @@ def _get_default_neuron_config(model_config: ModelConfig,
         max_context_length=scheduler_config.max_model_len,
         seq_len=scheduler_config.max_model_len,
         enable_bucketing=True,
-        is_continuous_batching=(batch_size > 1),
+        is_continuous_batching=True,
         quantized=False,
         torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
         padding_side="right",
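For context, a minimal sketch of what this hunk changes: the default now enables continuous batching unconditionally, where it was previously enabled only when the compiled batch size exceeded 1. The SimpleSchedulerConfig stand-in and default_neuron_config helper below are hypothetical simplifications for illustration, not vLLM's real SchedulerConfig or loader code, and the dtype mapping is omitted.

    # Hypothetical sketch of the post-change default (not the real vLLM code).
    from dataclasses import dataclass

    @dataclass
    class SimpleSchedulerConfig:
        # Stand-in for vllm.config.SchedulerConfig, reduced to the fields used here.
        max_model_len: int = 2048
        max_num_seqs: int = 4

    def default_neuron_config(scheduler_config: SimpleSchedulerConfig,
                              batch_size: int) -> dict:
        # After this commit the flag is always True; before, it was (batch_size > 1).
        # torch_dtype mapping omitted for brevity.
        return dict(
            max_context_length=scheduler_config.max_model_len,
            seq_len=scheduler_config.max_model_len,
            enable_bucketing=True,
            is_continuous_batching=True,  # previously: (batch_size > 1)
            quantized=False,
            padding_side="right",
        )

    # Even a single-sequence batch now gets a continuous-batching-capable config:
    assert default_neuron_config(SimpleSchedulerConfig(), batch_size=1)["is_continuous_batching"]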
@@ -520,6 +520,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
     args."""
     neuron_config = dict(
         tp_degree=parallel_config.tensor_parallel_size,
+        ctx_batch_size=1,
         batch_size=scheduler_config.max_num_seqs,
         max_context_length=scheduler_config.max_model_len,
         seq_len=scheduler_config.max_model_len,
@@ -527,6 +528,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
         trace_tokengen_model=False,
         enable_fused_speculation=True,
         enable_bucketing=True,
+        is_continuous_batching=True,
         quantized=False,
         torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
         on_device_sampling_config=dict(
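Similarly, a hedged sketch of the speculation defaults after the two added keys, ctx_batch_size=1 and is_continuous_batching=True. The helper below and its plain int parameters are illustrative stand-ins for the scheduler and parallel config fields, and the reading of ctx_batch_size as the context-encoding batch size is an assumption; the dtype and on-device-sampling entries are omitted.

    # Hypothetical sketch of the post-change speculation defaults (illustrative only).
    def default_speculation_config(max_model_len: int = 2048,
                                   max_num_seqs: int = 4,
                                   tp_degree: int = 2) -> dict:
        return dict(
            tp_degree=tp_degree,
            ctx_batch_size=1,              # added by this commit (assumed: context-encoding batch size)
            batch_size=max_num_seqs,
            max_context_length=max_model_len,
            seq_len=max_model_len,
            trace_tokengen_model=False,
            enable_fused_speculation=True,
            enable_bucketing=True,
            is_continuous_batching=True,   # added by this commit, matching the non-speculative default
            quantized=False,
        )

    cfg = default_speculation_config()
    print(cfg["ctx_batch_size"], cfg["is_continuous_batching"])  # 1 True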
|||||||
Loading…
x
Reference in New Issue
Block a user