From 6a417b8600d4d1e57698a91b71a38446e8fc5c45 Mon Sep 17 00:00:00 2001
From: ajayvohra2005 <ajayvohr@amazon.com>
Date: Thu, 20 Feb 2025 13:59:36 -0500
Subject: [PATCH] fix neuron performance issue (#13589)

---
 vllm/worker/neuron_worker.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 5f0eb0019eee2..95e7acd025f02 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -76,7 +76,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
         # Set the number of GPU blocks to be the same as the maximum number of
         # sequences that can be processed in a single batch. This is equivalent
         # to schedule without PagedAttention.
-        num_gpu_blocks = self.scheduler_config.max_num_seqs
+        num_gpu_blocks = self.scheduler_config.max_num_seqs + 1
 
         # Swap not yet supported with Neuron backend.
         num_cpu_blocks = 0
@@ -90,7 +90,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
 
         # Different values are not tested.
         assert num_cpu_blocks == 0
-        assert num_gpu_blocks == self.scheduler_config.max_num_seqs
+        assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1
 
         self.cache_config.num_gpu_blocks = num_gpu_blocks
         self.cache_config.num_cpu_blocks = num_cpu_blocks