From ebed81fbf5e4549b26b6baf74cea0fe4551dd915 Mon Sep 17 00:00:00 2001
From: aws-elaineyz
Date: Thu, 22 May 2025 02:18:55 -0700
Subject: [PATCH] Update default neuron config for speculation (#18274)

Signed-off-by: Elaine Zhao
Co-authored-by: Shashwat Srijan
Co-authored-by: Aakash Shetty
---
 vllm/model_executor/model_loader/neuronx_distributed.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py
index b98cea7fe6e1..3a4d93c8c13f 100644
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ b/vllm/model_executor/model_loader/neuronx_distributed.py
@@ -502,7 +502,7 @@ def _get_default_neuron_config(model_config: ModelConfig,
         max_context_length=scheduler_config.max_model_len,
         seq_len=scheduler_config.max_model_len,
         enable_bucketing=True,
-        is_continuous_batching=(batch_size > 1),
+        is_continuous_batching=True,
         quantized=False,
         torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
         padding_side="right",
@@ -520,6 +520,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
     args."""
     neuron_config = dict(
         tp_degree=parallel_config.tensor_parallel_size,
+        ctx_batch_size=1,
         batch_size=scheduler_config.max_num_seqs,
         max_context_length=scheduler_config.max_model_len,
         seq_len=scheduler_config.max_model_len,
@@ -527,6 +528,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
         trace_tokengen_model=False,
         enable_fused_speculation=True,
         enable_bucketing=True,
+        is_continuous_batching=True,
         quantized=False,
         torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
         on_device_sampling_config=dict(