[Platform] platform agnostic for EngineArgs initialization (#11225)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>

parent 59c9b6ebeb
commit e88db68cf5
@@ -112,9 +112,7 @@ class EngineArgs:
     pipeline_parallel_size: int = 1
     tensor_parallel_size: int = 1
     max_parallel_loading_workers: Optional[int] = None
-    # NOTE(kzawora): default block size for Gaudi should be 128
-    # smaller sizes still work, but very inefficiently
-    block_size: int = 16 if not current_platform.is_hpu() else 128
+    block_size: Optional[int] = None
     enable_prefix_caching: Optional[bool] = None
     disable_sliding_window: bool = False
     use_v2_block_manager: bool = True
@@ -1036,9 +1034,7 @@ class EngineArgs:
             self.enable_prefix_caching = False
 
         cache_config = CacheConfig(
-            # neuron needs block_size = max_model_len
-            block_size=self.block_size if self.device != "neuron" else
-            (self.max_model_len if self.max_model_len is not None else 0),
+            block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
             swap_space=self.swap_space,
             cache_dtype=self.kv_cache_dtype,
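Taken together, the two EngineArgs hunks above reduce to a simple pattern: the dataclass field carries no platform knowledge any more, and its value (possibly None) is forwarded verbatim into the cache config, to be resolved later by a platform hook. A minimal sketch of that side of the change, using simplified stand-in classes (EngineArgsSketch and CacheConfigSketch are illustrative names, not the real vLLM types):

from dataclasses import dataclass
from typing import Optional


@dataclass
class CacheConfigSketch:
    # May still be None at this point; a platform hook fills it in later.
    block_size: Optional[int]


@dataclass
class EngineArgsSketch:
    # Was `16 if not current_platform.is_hpu() else 128`; now platform-agnostic.
    block_size: Optional[int] = None

    def create_engine_config(self) -> CacheConfigSketch:
        # No per-device special case (e.g. neuron) here anymore; the
        # platform-specific defaults live in check_and_update_config hooks.
        return CacheConfigSketch(block_size=self.block_size)


# None flows through untouched until a platform resolves it.
assert EngineArgsSketch().create_engine_config().block_size is None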
@@ -60,6 +60,9 @@ class CpuPlatform(Platform):
 
         cache_config = vllm_config.cache_config
 
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
 
         if kv_cache_space >= 0:
@@ -137,6 +137,10 @@ class CudaPlatformBase(Platform):
         else:
             parallel_config.worker_cls = "vllm.worker.worker.Worker"
 
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
@@ -48,6 +48,12 @@ class HpuPlatform(Platform):
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
 
+        # NOTE(kzawora): default block size for Gaudi should be 128
+        # smaller sizes still work, but very inefficiently
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 128
+
     @classmethod
     def is_pin_memory_available(cls):
         logger.warning("Pin memory is not supported on HPU.")
@@ -33,6 +33,12 @@ class NeuronPlatform(Platform):
             parallel_config.worker_cls = \
                 "vllm.worker.neuron_worker.NeuronWorker"
 
+        cache_config = vllm_config.cache_config
+        if cache_config:
+            # neuron needs block_size = max_model_len
+            vllm_config.cache_config.block_size = \
+                vllm_config.model_config.max_model_len
+
     @classmethod
     def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on Neuron.")
@@ -87,6 +87,9 @@ class OpenVinoPlatform(Platform):
         # check and update cache config
         ov_core = ov.Core()
         cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
             if not OpenVinoPlatform.is_openvino_cpu():
                 logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
@@ -84,6 +84,10 @@ class RocmPlatform(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         parallel_config = vllm_config.parallel_config
         scheduler_config = vllm_config.scheduler_config
         if parallel_config.worker_cls == "auto":
@@ -46,6 +46,11 @@ class TpuPlatform(Platform):
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         from vllm.config import CompilationLevel
+
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         compilation_config = vllm_config.compilation_config
         if compilation_config.level == CompilationLevel.NO_COMPILATION:
             # TPU does not support NO_COMPILATION
@@ -51,6 +51,10 @@ class XPUPlatform(Platform):
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         # check and update model config
         model_config = vllm_config.model_config
         if model_config.dtype == torch.bfloat16:
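Every hook above except Neuron's (which always ties block_size to max_model_len) guards its default with an `is None` check, so an explicitly configured block size survives the platform pass and only an unset value picks up the platform default. A small self-contained sketch of that behaviour, assuming the engine applies the current platform's check_and_update_config while finalizing the config (the call site is not part of this diff, and the class names below are simplified stand-ins, not the real vLLM types):

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class SimpleCacheConfig:
    block_size: Optional[int] = None


@dataclass
class SimpleVllmConfig:
    cache_config: SimpleCacheConfig = field(default_factory=SimpleCacheConfig)


class CpuLikePlatform:
    @classmethod
    def check_and_update_config(cls, vllm_config: SimpleVllmConfig) -> None:
        cache_config = vllm_config.cache_config
        if cache_config and cache_config.block_size is None:
            cache_config.block_size = 16  # same default the CPU/CUDA hunks install


# Unset block size: the platform default kicks in.
cfg = SimpleVllmConfig()
CpuLikePlatform.check_and_update_config(cfg)
assert cfg.cache_config.block_size == 16

# Explicit block size: the `is None` guard leaves it alone.
cfg = SimpleVllmConfig(cache_config=SimpleCacheConfig(block_size=32))
CpuLikePlatform.check_and_update_config(cfg)
assert cfg.cache_config.block_size == 32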