mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-22 08:54:36 +08:00
ruff format
Signed-off-by: Vladimir Anisimov <vanisimov@nvidia.com>
This commit is contained in:
parent
2817110aa3
commit
2c92ed30cd
@ -1865,7 +1865,7 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
Tests for the _find_best_group_size function which finds optimal
|
||||
KV cache group sizes while preferring larger groups.
|
||||
|
||||
|
||||
Key behaviors:
|
||||
- Prefers LARGER group sizes
|
||||
- Enforces group_size >= 3 unless overhead exceeds 10%
|
||||
@ -1913,12 +1913,12 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
full_spec = new_kv_cache_spec()
|
||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||
|
||||
|
||||
same_type_layers = {
|
||||
sw_spec: [f"sw_{i}" for i in range(25)],
|
||||
full_spec: [f"full_{i}" for i in range(5)],
|
||||
}
|
||||
|
||||
|
||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||
# GCD(25, 5) = 5, so group_size=5 gives 0 padding
|
||||
# Larger sizes like 25 would give padding for full layers
|
||||
@ -1932,12 +1932,12 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
full_spec = new_kv_cache_spec()
|
||||
local_spec = new_sliding_window_spec(sliding_window=256)
|
||||
|
||||
|
||||
same_type_layers = {
|
||||
local_spec: [f"local_{i}" for i in range(24)],
|
||||
full_spec: [f"full_{i}" for i in range(8)],
|
||||
}
|
||||
|
||||
|
||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||
# GCD(24, 8) = 8, both 4 and 8 give 0 padding
|
||||
# Prefer 8 (larger group size = fewer groups)
|
||||
@ -1951,12 +1951,12 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
full_spec = new_kv_cache_spec()
|
||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||
|
||||
|
||||
same_type_layers = {
|
||||
full_spec: [f"full_{i}" for i in range(20)],
|
||||
sw_spec: [f"sw_{i}" for i in range(30)],
|
||||
}
|
||||
|
||||
|
||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||
# GCD(20, 30) = 10, both 5 and 10 divide evenly
|
||||
# Prefer 10 (larger = fewer groups)
|
||||
@ -1970,12 +1970,12 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
full_spec = new_kv_cache_spec()
|
||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||
|
||||
|
||||
same_type_layers = {
|
||||
sw_spec: [f"sw_{i}" for i in range(12)],
|
||||
full_spec: [f"full_{i}" for i in range(13)],
|
||||
}
|
||||
|
||||
|
||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||
# group_size=13: 1 padding for sw, 0 for full
|
||||
# 1 padding out of 25 total = 4% overhead, well under 10%
|
||||
@ -1984,25 +1984,24 @@ class TestFindBestGroupSize:
|
||||
def test_fallback_when_overhead_exceeds_threshold(self, vllm_config):
    """
    When enforcing min_group_size >= 3 adds > 10% overhead, fallback to 1.

    Example: 1 full + 5 sw layers.
    - group_size=1: 0 padding (optimal baseline)
    - group_size=3: need to pad 2 full layers + 1 sw layer = 3 padding layers
      That's 3 padding out of 6 total = 50% overhead, way over 10%
    - group_size=5: need to pad 4 full layers = 4 padding layers
      That's 4 padding out of 6 total = 67% overhead

    So group_size=1 should be chosen as the fallback.
    """
    full_spec = new_kv_cache_spec()
    sw_spec = new_sliding_window_spec(sliding_window=512)

    same_type_layers = {
        full_spec: ["full_0"],  # 1 full layer
        sw_spec: [f"sw_{i}" for i in range(5)],  # 5 sw layers
    }

    result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
    # group_size >= 3 would add > 10% overhead, so fallback to 1
    assert result == 1
|
||||
|
||||
|
||||
@ -941,23 +941,24 @@ def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bo
|
||||
|
||||
|
||||
def _find_best_group_size(
|
||||
same_type_layers: dict["KVCacheSpec", list[str]],
|
||||
vllm_config: "VllmConfig",
|
||||
min_preferred_group_size: int = 3,
|
||||
overhead_threshold: float = 0.10) -> int:
|
||||
same_type_layers: dict["KVCacheSpec", list[str]],
|
||||
vllm_config: "VllmConfig",
|
||||
min_preferred_group_size: int = 3,
|
||||
overhead_threshold: float = 0.10,
|
||||
) -> int:
|
||||
"""
|
||||
Find the optimal group size that minimizes padding memory, preferring
|
||||
larger group sizes.
|
||||
|
||||
For each layer type, padding = (group_size - count % group_size) % group_size
|
||||
weighted by that layer's max_memory_usage_bytes. Different layer types
|
||||
weighted by that layer's max_memory_usage_bytes. Different layer types
|
||||
contribute differently to total padding based on their actual memory usage
|
||||
(e.g., full attention vs sliding window).
|
||||
|
||||
This function prefers LARGER group sizes. Empirically, small group sizes (1-2)
|
||||
lead to KV cache memory being concentrated in just a few large tensors, which
|
||||
can reduce performance due to memory allocation patterns.
|
||||
|
||||
|
||||
The algorithm enforces group_size >= min_preferred_group_size (default 3),
|
||||
unless doing so would add more than overhead_threshold (default 10%) extra
|
||||
padding memory compared to the optimal unconstrained group size.
|
||||
@ -973,7 +974,7 @@ def _find_best_group_size(
|
||||
|
||||
Returns:
|
||||
The optimal group size (minimizes padding, ties broken by larger group size)
|
||||
|
||||
|
||||
Raises:
|
||||
ValueError: If same_type_layers is empty
|
||||
"""
|
||||
@ -999,12 +1000,11 @@ def _find_best_group_size(
|
||||
|
||||
def find_best_in_range(start: int, end: int) -> int:
|
||||
"""Find best group size in [start, end] range.
|
||||
|
||||
|
||||
Prefers larger group sizes when padding is equal.
|
||||
Key: (padding_memory, -group_size) so larger group_size wins ties.
|
||||
"""
|
||||
return min(range(start, end + 1),
|
||||
key=lambda gs: (calc_padding_memory(gs), -gs))
|
||||
return min(range(start, end + 1), key=lambda gs: (calc_padding_memory(gs), -gs))
|
||||
|
||||
# Calculate baseline: optimal group size with no minimum constraint
|
||||
baseline_group_size = find_best_in_range(1, max_layers)
|
||||
@ -1020,8 +1020,11 @@ def _find_best_group_size(
|
||||
|
||||
# Check if enforcing the minimum preference adds too much overhead
|
||||
# Overhead is measured relative to total memory
|
||||
overhead = (preferred_padding - baseline_padding) / total_base_memory \
|
||||
if total_base_memory > 0 else 0.0
|
||||
overhead = (
|
||||
(preferred_padding - baseline_padding) / total_base_memory
|
||||
if total_base_memory > 0
|
||||
else 0.0
|
||||
)
|
||||
|
||||
if overhead > overhead_threshold:
|
||||
# Fallback to baseline (allowing smaller group sizes)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user