fix neuron performance issue (#13589)

2025-12-25 18:28:49 +08:00 · 2025-02-20 13:59:36 -05:00 · 2025-02-20 13:59:36 -05:00 · 6a417b8600
commit 6a417b8600
parent d3ea50113c
1 changed files with 2 additions and 2 deletions
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@ -76,7 +76,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
        # Set the number of GPU blocks to be the same as the maximum number of
        # sequences that can be processed in a single batch. This is equivalent
        # to schedule without PagedAttention.
-        num_gpu_blocks = self.scheduler_config.max_num_seqs
+        num_gpu_blocks = self.scheduler_config.max_num_seqs + 1

        # Swap not yet supported with Neuron backend.
        num_cpu_blocks = 0
@ -90,7 +90,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):

        # Different values are not tested.
        assert num_cpu_blocks == 0
-        assert num_gpu_blocks == self.scheduler_config.max_num_seqs
+        assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks