diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d99387542da1..48d01fcfd8f5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -316,7 +316,7 @@ class EngineArgs: parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, - choices=[8, 16, 32], + choices=[8, 16, 32, 128, 256, 512, 1024, 2048], help='Token block size for contiguous chunks of ' 'tokens.') diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 5d4c4f497f47..b45d5d86b54f 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -100,9 +100,8 @@ class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase): self, execute_model_req: ExecuteModelRequest, ) -> List[SamplerOutput]: - output = await make_async( - self.driver_worker.execute_model - )(seq_group_metadata_list=execute_model_req.seq_group_metadata_list, ) + output = await make_async(self.driver_worker.execute_model + )(execute_model_req=execute_model_req, ) return output async def check_health_async(self) -> None: diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 6448e5ff4ac5..4f3fed2dbd72 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -197,6 +197,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): virtual_engine: int = 0, finished_requests_ids: Optional[List[str]] = None ) -> ModelInputForNeuron: + multi_modal_kwargs = None # NOTE: We assume that all sequences in the group are all prompts or # all decodes. is_prompt = seq_group_metadata_list[0].is_prompt diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index f7525e049ee3..3b0ded36ca1b 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -89,6 +89,9 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): return WorkerInput(num_seq_groups=len( execute_model_req.seq_group_metadata_list), ) + def execute_worker(self, worker_input: WorkerInput) -> None: + pass + def get_cache_block_size_bytes(self) -> int: """Determine the size in bytes of a cache block.