[Feature] Change cache.py with pydantic validation (#26390)

Signed-off-by: Vinay Damodaran <vrdn@hey.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Vinay R Damodaran 2025-10-08 11:12:59 -07:00 committed by GitHub
parent e09d1753ec
commit b25d7b5657
2 changed files with 25 additions and 60 deletions


@@ -3,13 +3,11 @@
import hashlib
from dataclasses import field
from typing import TYPE_CHECKING, Any, Literal, Optional, get_args
from typing import TYPE_CHECKING, Any, Literal, Optional
from pydantic import SkipValidation, model_validator
from pydantic import Field, SkipValidation, field_validator
from pydantic.dataclasses import dataclass
from typing_extensions import Self
import vllm.envs as envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.utils import GiB_bytes, get_cpu_memory
@@ -39,7 +37,7 @@ class CacheConfig:
This config has no static default. If left unspecified by the user, it will
be set in `Platform.check_and_update_config()` based on the current
platform."""
gpu_memory_utilization: float = 0.9
gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
"""The fraction of GPU memory to be used for the model executor, which can
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
utilization. If unspecified, will use the default value of 0.9. This is a
@@ -47,7 +45,7 @@ class CacheConfig:
not matter if you have another vLLM instance running on the same GPU. For
example, if you have two vLLM instances running on the same GPU, you can
set the GPU memory utilization to 0.5 for each instance."""
swap_space: float = 4
swap_space: float = Field(default=4, ge=0)
"""Size of the CPU swap space per GPU (in GiB)."""
cache_dtype: CacheDType = "auto"
"""Data type for kv cache storage. If "auto", will use model data type.
@@ -73,7 +71,7 @@ class CacheConfig:
- "sha256" uses Pickle for object serialization before hashing.\n
- "sha256_cbor" provides a reproducible, cross-language compatible hash. It
serializes objects using canonical CBOR and hashes them with SHA-256."""
cpu_offload_gb: float = 0
cpu_offload_gb: float = Field(default=0, ge=0)
"""The space in GiB to offload to CPU, per GPU. Default is 0, which means
no offloading. Intuitively, this argument can be seen as a virtual way to
increase the GPU memory size. For example, if you have one 24 GB GPU and
@@ -147,74 +145,33 @@ class CacheConfig:
hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self) -> None:
self.swap_space_bytes = self.swap_space * GiB_bytes
self._verify_cache_dtype()
self._verify_prefix_caching()
def metrics_info(self):
# convert cache_config to dict(key: str, value: str) for prometheus
# metrics info
return {key: str(value) for key, value in self.__dict__.items()}
@model_validator(mode="after")
def _verify_args(self) -> Self:
if self.cpu_offload_gb < 0:
raise ValueError(
f"CPU offload space must be non-negative, but got {self.cpu_offload_gb}"
)
if self.gpu_memory_utilization > 1.0:
raise ValueError(
"GPU memory utilization must be less than 1.0. Got "
f"{self.gpu_memory_utilization}."
)
return self
def _verify_cache_dtype(self) -> None:
if self.cache_dtype == "auto":
pass
elif self.cache_dtype in get_args(CacheDType):
if self.cache_dtype.startswith("fp8"):
logger.info(
"Using fp8 data type to store kv cache. It reduces the GPU "
"memory footprint and boosts the performance. "
"Meanwhile, it may cause accuracy drop without a proper "
"scaling factor."
)
else:
raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
def _verify_prefix_caching(self) -> None:
if not self.enable_prefix_caching:
return
if self.sliding_window is not None and not envs.VLLM_USE_V1:
raise NotImplementedError(
"Prefix caching is not supported with sliding window. "
"Run with --disable-sliding-window to use prefix caching."
)
if self.enable_prefix_caching and self.prefix_caching_hash_algo not in get_args(
PrefixCachingHashAlgo
):
raise ValueError(
"Unknown prefix caching hash algorithm: "
f"{self.prefix_caching_hash_algo}. Must be one of "
f"{get_args(PrefixCachingHashAlgo)}."
@field_validator("cache_dtype", mode="after")
@classmethod
def _validate_cache_dtype(cls, cache_dtype: CacheDType) -> CacheDType:
if cache_dtype.startswith("fp8"):
logger.info(
"Using fp8 data type to store kv cache. It reduces the GPU "
"memory footprint and boosts the performance. "
"Meanwhile, it may cause accuracy drop without a proper "
"scaling factor."
)
return cache_dtype
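The new hook is per-field rather than whole-model: with mode="after", the validator receives a value that has already passed the CacheDType Literal check, so it only has to emit the fp8 log note and hand the value back. A small sketch of the same pattern on a hypothetical config:

# Sketch: the field_validator(mode="after") pattern used above, on a made-up field.
from pydantic import field_validator
from pydantic.dataclasses import dataclass

@dataclass
class _DemoConfig:
    dtype: str = "auto"

    @field_validator("dtype", mode="after")
    @classmethod
    def _note_fp8(cls, value: str) -> str:
        # Runs after pydantic has validated the field's declared type.
        if value.startswith("fp8"):
            print("fp8 kv cache selected")  # stand-in for logger.info(...)
        return value  # an "after" validator must return the (possibly adjusted) value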
def verify_with_parallel_config(
self,
parallel_config: ParallelConfig,
) -> None:
swap_space_bytes = self.swap_space * GiB_bytes
total_cpu_memory = get_cpu_memory()
# FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
# group are in the same node. However, the GPUs may span multiple nodes.
num_gpus_per_node = parallel_config.tensor_parallel_size
cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
cpu_memory_usage = swap_space_bytes * num_gpus_per_node
msg = (
f"{cpu_memory_usage / GiB_bytes:.2f} GiB out of the "


@@ -27,6 +27,7 @@ import huggingface_hub
import regex as re
import torch
from pydantic import TypeAdapter, ValidationError
from pydantic.fields import FieldInfo
from typing_extensions import TypeIs, deprecated
import vllm.envs as envs
@@ -209,6 +210,13 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]:
# Get the default value of the field
if field.default is not MISSING:
default = field.default
# Handle pydantic.Field defaults
if isinstance(default, FieldInfo):
default = (
default.default
if default.default_factory is None
else default.default_factory()
)
elif field.default_factory is not MISSING:
default = field.default_factory()
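Field(...) used as a dataclass default shows up as a FieldInfo object in field.default, which is why the argparse helper has to unwrap it before it can use the default value. A stripped-down sketch of the same unwrapping outside _compute_kwargs, on a hypothetical config (it assumes the FieldInfo really is surfaced as the dataclass default, as the branch above expects):

# Sketch: unwrapping a pydantic FieldInfo default, mirroring the new branch above.
from dataclasses import fields
from pydantic import Field
from pydantic.dataclasses import dataclass
from pydantic.fields import FieldInfo

@dataclass
class _DemoConfig:
    gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)

fld = fields(_DemoConfig)[0]
default = fld.default
if isinstance(default, FieldInfo):
    # A FieldInfo carries either a plain default or a default_factory, not both.
    default = (
        default.default
        if default.default_factory is None
        else default.default_factory()
    )
print(default)  # 0.9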