From ebed81fbf5e4549b26b6baf74cea0fe4551dd915 Mon Sep 17 00:00:00 2001
From: aws-elaineyz <elaineyz@amazon.com>
Date: Thu, 22 May 2025 02:18:55 -0700
Subject: [PATCH] Update default neuron config for speculation (#18274)

Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: Shashwat Srijan <sssrijan@amazon.com>
Co-authored-by: Aakash Shetty <sheaak@amazon.com>
---
 vllm/model_executor/model_loader/neuronx_distributed.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py
index b98cea7fe6e1..3a4d93c8c13f 100644
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ b/vllm/model_executor/model_loader/neuronx_distributed.py
@@ -502,7 +502,7 @@ def _get_default_neuron_config(model_config: ModelConfig,
         max_context_length=scheduler_config.max_model_len,
         seq_len=scheduler_config.max_model_len,
         enable_bucketing=True,
-        is_continuous_batching=(batch_size > 1),
+        is_continuous_batching=True,
         quantized=False,
         torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
         padding_side="right",
@@ -520,6 +520,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
     args."""
     neuron_config = dict(
         tp_degree=parallel_config.tensor_parallel_size,
+        ctx_batch_size=1,
         batch_size=scheduler_config.max_num_seqs,
         max_context_length=scheduler_config.max_model_len,
         seq_len=scheduler_config.max_model_len,
@@ -527,6 +528,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
         trace_tokengen_model=False,
         enable_fused_speculation=True,
         enable_bucketing=True,
+        is_continuous_batching=True,
         quantized=False,
         torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
         on_device_sampling_config=dict(