diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 5f0eb0019eee2..95e7acd025f02 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -76,7 +76,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): # Set the number of GPU blocks to be the same as the maximum number of # sequences that can be processed in a single batch. This is equivalent # to schedule without PagedAttention. - num_gpu_blocks = self.scheduler_config.max_num_seqs + num_gpu_blocks = self.scheduler_config.max_num_seqs + 1 # Swap not yet supported with Neuron backend. num_cpu_blocks = 0 @@ -90,7 +90,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): # Different values are not tested. assert num_cpu_blocks == 0 - assert num_gpu_blocks == self.scheduler_config.max_num_seqs + assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1 self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks