mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-25 18:28:49 +08:00
fix neuron performance issue (#13589)
This commit is contained in:
parent
d3ea50113c
commit
6a417b8600
@@ -76,7 +76,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
|
||||
# Set the number of GPU blocks to be the same as the maximum number of
|
||||
# sequences that can be processed in a single batch. This is equivalent
|
||||
# to schedule without PagedAttention.
|
||||
- num_gpu_blocks = self.scheduler_config.max_num_seqs
|
||||
+ num_gpu_blocks = self.scheduler_config.max_num_seqs + 1
|
||||
|
||||
# Swap not yet supported with Neuron backend.
|
||||
num_cpu_blocks = 0
|
||||
@@ -90,7 +90,7 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
|
||||
|
||||
# Different values are not tested.
|
||||
assert num_cpu_blocks == 0
|
||||
- assert num_gpu_blocks == self.scheduler_config.max_num_seqs
|
||||
+ assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1
|
||||
|
||||
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
||||
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user