vllm/vllm/platforms/neuron.py

from typing import TYPE_CHECKING, Optional

from vllm.logger import init_logger

from .interface import Platform, PlatformEnum

if TYPE_CHECKING:
    from vllm.config import VllmConfig
else:
    VllmConfig = None

logger = init_logger(__name__)


class NeuronPlatform(Platform):
    _enum = PlatformEnum.NEURON
    device_name: str = "neuron"
    device_type: str = "neuron"
    # Ray resource key and the env var used to pin visible Neuron cores.
    ray_device_key: str = "neuron_cores"
    supported_quantization: list[str] = ["neuron_quant"]
    device_control_env_var: str = "NEURON_RT_VISIBLE_CORES"

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return "neuron"

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        return False

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        parallel_config = vllm_config.parallel_config
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = \
                "vllm.worker.neuron_worker.NeuronWorker"

        if parallel_config.world_size > 1:
            parallel_config.distributed_executor_backend = "uni"

        assert (vllm_config.lora_config
                is None), "LoRA is not supported for Neuron backend."
        assert (not vllm_config.speculative_config
                ), "Speculative decoding not yet supported for Neuron backend."

        cache_config = vllm_config.cache_config
        if cache_config:
            # Neuron requires block_size == max_model_len, so a single
            # cache block spans the whole sequence.
            vllm_config.cache_config.block_size = \
                vllm_config.model_config.max_model_len

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        logger.warning("Pin memory is not supported on Neuron.")
        return False
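

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the upstream module): a minimal check of
# the platform hooks above, assuming vLLM is installed with Neuron support.
# check_and_update_config() is omitted here because it needs a fully built
# VllmConfig; the calls below only exercise the static classmethods.
if __name__ == "__main__":
    # Device identity reported for logging and scheduling.
    print(NeuronPlatform.get_device_name())                 # "neuron"
    # Async output streaming is disabled regardless of enforce_eager.
    print(NeuronPlatform.is_async_output_supported(None))   # False
    # Pin memory is unavailable; a warning is logged and False is returned.
    print(NeuronPlatform.is_pin_memory_available())         # False
    # Ray resource key and the env var that controls visible Neuron cores.
    print(NeuronPlatform.ray_device_key,
          NeuronPlatform.device_control_env_var)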