mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-01-28 04:47:13 +08:00)
[Feature] Change cache.py with pydantic validation (#26390)
Signed-off-by: Vinay Damodaran <vrdn@hey.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent e09d1753ec
commit b25d7b5657
@@ -3,13 +3,11 @@
 import hashlib
-from dataclasses import field
-from typing import TYPE_CHECKING, Any, Literal, Optional, get_args
+from typing import TYPE_CHECKING, Any, Literal, Optional

-from pydantic import SkipValidation, model_validator
+from pydantic import Field, SkipValidation, field_validator
 from pydantic.dataclasses import dataclass
-from typing_extensions import Self

 import vllm.envs as envs
 from vllm.config.utils import config
 from vllm.logger import init_logger
 from vllm.utils import GiB_bytes, get_cpu_memory
@@ -39,7 +37,7 @@ class CacheConfig:
     This config has no static default. If left unspecified by the user, it will
     be set in `Platform.check_and_update_config()` based on the current
     platform."""
-    gpu_memory_utilization: float = 0.9
+    gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
     """The fraction of GPU memory to be used for the model executor, which can
     range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
     utilization. If unspecified, will use the default value of 0.9. This is a
@@ -47,7 +45,7 @@ class CacheConfig:
     not matter if you have another vLLM instance running on the same GPU. For
     example, if you have two vLLM instances running on the same GPU, you can
     set the GPU memory utilization to 0.5 for each instance."""
-    swap_space: float = 4
+    swap_space: float = Field(default=4, ge=0)
     """Size of the CPU swap space per GPU (in GiB)."""
     cache_dtype: CacheDType = "auto"
     """Data type for kv cache storage. If "auto", will use model data type.
@@ -73,7 +71,7 @@ class CacheConfig:
     - "sha256" uses Pickle for object serialization before hashing.\n
     - "sha256_cbor" provides a reproducible, cross-language compatible hash. It
     serializes objects using canonical CBOR and hashes them with SHA-256."""
-    cpu_offload_gb: float = 0
+    cpu_offload_gb: float = Field(default=0, ge=0)
     """The space in GiB to offload to CPU, per GPU. Default is 0, which means
     no offloading. Intuitively, this argument can be seen as a virtual way to
     increase the GPU memory size. For example, if you have one 24 GB GPU and
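
Note: the Field(gt=0, le=1) and Field(ge=0) bounds above are what make the hand-rolled range checks (removed in the next hunk) redundant, since pydantic now rejects out-of-range values at construction time. A minimal, self-contained sketch of that behavior, using a made-up TinyCacheConfig rather than vLLM's real class:

from pydantic import Field, ValidationError
from pydantic.dataclasses import dataclass


@dataclass
class TinyCacheConfig:
    # gt=0, le=1: rejects 0.0 and anything above 1.0 at construction time.
    gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
    # ge=0: allows 0 but rejects negative values.
    swap_space: float = Field(default=4, ge=0)


TinyCacheConfig()  # ok: the defaults satisfy the constraints
try:
    TinyCacheConfig(gpu_memory_utilization=1.5)
except ValidationError as e:
    print(e)  # "Input should be less than or equal to 1"

Note that gt=0 is slightly stricter than the removed _verify_args below, which only rejected values above 1.0.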
@@ -147,74 +145,33 @@ class CacheConfig:
         hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
         return hash_str

-    def __post_init__(self) -> None:
-        self.swap_space_bytes = self.swap_space * GiB_bytes
-
-        self._verify_cache_dtype()
-        self._verify_prefix_caching()
-
     def metrics_info(self):
         # convert cache_config to dict(key: str, value: str) for prometheus
         # metrics info
         return {key: str(value) for key, value in self.__dict__.items()}

-    @model_validator(mode="after")
-    def _verify_args(self) -> Self:
-        if self.cpu_offload_gb < 0:
-            raise ValueError(
-                f"CPU offload space must be non-negative, but got {self.cpu_offload_gb}"
-            )
-
-        if self.gpu_memory_utilization > 1.0:
-            raise ValueError(
-                "GPU memory utilization must be less than 1.0. Got "
-                f"{self.gpu_memory_utilization}."
-            )
-
-        return self
-
-    def _verify_cache_dtype(self) -> None:
-        if self.cache_dtype == "auto":
-            pass
-        elif self.cache_dtype in get_args(CacheDType):
-            if self.cache_dtype.startswith("fp8"):
-                logger.info(
-                    "Using fp8 data type to store kv cache. It reduces the GPU "
-                    "memory footprint and boosts the performance. "
-                    "Meanwhile, it may cause accuracy drop without a proper "
-                    "scaling factor."
-                )
-        else:
-            raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
-
-    def _verify_prefix_caching(self) -> None:
-        if not self.enable_prefix_caching:
-            return
-
-        if self.sliding_window is not None and not envs.VLLM_USE_V1:
-            raise NotImplementedError(
-                "Prefix caching is not supported with sliding window. "
-                "Run with --disable-sliding-window to use prefix caching."
-            )
-
-        if self.enable_prefix_caching and self.prefix_caching_hash_algo not in get_args(
-            PrefixCachingHashAlgo
-        ):
-            raise ValueError(
-                "Unknown prefix caching hash algorithm: "
-                f"{self.prefix_caching_hash_algo}. Must be one of "
-                f"{get_args(PrefixCachingHashAlgo)}."
-            )
+    @field_validator("cache_dtype", mode="after")
+    @classmethod
+    def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
+        if cache_dtype.startswith("fp8"):
+            logger.info(
+                "Using fp8 data type to store kv cache. It reduces the GPU "
+                "memory footprint and boosts the performance. "
+                "Meanwhile, it may cause accuracy drop without a proper "
+                "scaling factor."
+            )
+        return cache_dtype

     def verify_with_parallel_config(
         self,
         parallel_config: ParallelConfig,
     ) -> None:
+        swap_space_bytes = self.swap_space * GiB_bytes
         total_cpu_memory = get_cpu_memory()
         # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
         # group are in the same node. However, the GPUs may span multiple nodes.
         num_gpus_per_node = parallel_config.tensor_parallel_size
-        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
+        cpu_memory_usage = swap_space_bytes * num_gpus_per_node

         msg = (
             f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "
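
Note: the new @field_validator keeps only the fp8 log message from the old _verify_cache_dtype; the membership check ("Unknown kv cache dtype") goes away, presumably because pydantic already rejects any value outside the CacheDType Literal during field validation. A runnable sketch of the pattern, with made-up names (TinyConfig, and print standing in for logger.info):

from pydantic import field_validator
from pydantic.dataclasses import dataclass


@dataclass
class TinyConfig:
    cache_dtype: str = "auto"

    # mode="after": runs on the already type-validated value of this one
    # field, and must return the (possibly transformed) value.
    @field_validator("cache_dtype", mode="after")
    @classmethod
    def _validate_cache_dtype(cls, cache_dtype: str) -> str:
        if cache_dtype.startswith("fp8"):
            print("using an fp8 kv cache dtype")  # stand-in for logger.info
        return cache_dtype


TinyConfig(cache_dtype="fp8_e4m3")  # prints the notice
TinyConfig()                        # "auto" passes through silently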
@@ -27,6 +27,7 @@ import huggingface_hub
 import regex as re
 import torch
 from pydantic import TypeAdapter, ValidationError
+from pydantic.fields import FieldInfo
 from typing_extensions import TypeIs, deprecated

 import vllm.envs as envs
@@ -209,6 +210,13 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
         # Get the default value of the field
         if field.default is not MISSING:
             default = field.default
+            # Handle pydantic.Field defaults
+            if isinstance(default, FieldInfo):
+                default = (
+                    default.default
+                    if default.default_factory is None
+                    else default.default_factory()
+                )
         elif field.default_factory is not MISSING:
             default = field.default_factory()
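
Note: this hunk is needed because declaring a default via Field(...) makes dataclasses.fields() report the FieldInfo object itself as the field's default, so the CLI-argument machinery has to unwrap it. A standalone sketch of the same unwrapping logic (the helper name unwrap_default is made up):

from pydantic import Field
from pydantic.fields import FieldInfo


def unwrap_default(default: object) -> object:
    # Field(...) evaluates to a FieldInfo; the real default lives inside it,
    # either as a plain value or behind a zero-argument default_factory.
    if isinstance(default, FieldInfo):
        return (
            default.default
            if default.default_factory is None
            else default.default_factory()
        )
    return default


print(unwrap_default(Field(default=0.9, gt=0, le=1)))  # 0.9
print(unwrap_default(Field(default_factory=list)))     # []
print(unwrap_default(42))                              # 42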