ruff format

Signed-off-by: Vladimir Anisimov <vanisimov@nvidia.com>
This commit is contained in:
Vladimir Anisimov 2025-12-24 02:32:12 -08:00
parent 2817110aa3
commit 2c92ed30cd
2 changed files with 28 additions and 26 deletions

View File

@ -1865,7 +1865,7 @@ class TestFindBestGroupSize:
"""
Tests for the _find_best_group_size function which finds optimal
KV cache group sizes while preferring larger groups.
Key behaviors:
- Prefers LARGER group sizes
- Enforces group_size >= 3 unless overhead exceeds 10%
@ -1913,12 +1913,12 @@ class TestFindBestGroupSize:
"""
full_spec = new_kv_cache_spec()
sw_spec = new_sliding_window_spec(sliding_window=512)
same_type_layers = {
sw_spec: [f"sw_{i}" for i in range(25)],
full_spec: [f"full_{i}" for i in range(5)],
}
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
# GCD(25, 5) = 5, so group_size=5 gives 0 padding
# Larger sizes like 25 would give padding for full layers
@ -1932,12 +1932,12 @@ class TestFindBestGroupSize:
"""
full_spec = new_kv_cache_spec()
local_spec = new_sliding_window_spec(sliding_window=256)
same_type_layers = {
local_spec: [f"local_{i}" for i in range(24)],
full_spec: [f"full_{i}" for i in range(8)],
}
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
# GCD(24, 8) = 8, both 4 and 8 give 0 padding
# Prefer 8 (larger group size = fewer groups)
@ -1951,12 +1951,12 @@ class TestFindBestGroupSize:
"""
full_spec = new_kv_cache_spec()
sw_spec = new_sliding_window_spec(sliding_window=512)
same_type_layers = {
full_spec: [f"full_{i}" for i in range(20)],
sw_spec: [f"sw_{i}" for i in range(30)],
}
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
# GCD(20, 30) = 10, both 5 and 10 divide evenly
# Prefer 10 (larger = fewer groups)
@ -1970,12 +1970,12 @@ class TestFindBestGroupSize:
"""
full_spec = new_kv_cache_spec()
sw_spec = new_sliding_window_spec(sliding_window=512)
same_type_layers = {
sw_spec: [f"sw_{i}" for i in range(12)],
full_spec: [f"full_{i}" for i in range(13)],
}
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
# group_size=13: 1 padding for sw, 0 for full
# 1 padding out of 25 total = 4% overhead, well under 10%
@ -1984,25 +1984,24 @@ class TestFindBestGroupSize:
def test_fallback_when_overhead_exceeds_threshold(self, vllm_config):
"""
When enforcing min_group_size >= 3 adds > 10% overhead, fall back to 1.

Example: 1 full + 5 sw layers (6 layers total, assuming roughly equal
per-layer memory — TODO confirm the test specs weigh layers equally):
- group_size=1: 0 padding (optimal baseline)
- group_size=3: need to pad 2 full layers + 1 sw layer = 3 padding layers
  That's 3 padding out of 6 total = 50% overhead, way over the 10% threshold
- group_size=5: need to pad 4 full layers = 4 padding layers
  That's 4 padding out of 6 total = 67% overhead
So group_size=1 should be chosen as the fallback.
"""
full_spec = new_kv_cache_spec()
sw_spec = new_sliding_window_spec(sliding_window=512)
same_type_layers = {
full_spec: ["full_0"],  # 1 full layer
sw_spec: [f"sw_{i}" for i in range(5)],  # 5 sw layers
}
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
# Every group size >= 3 exceeds the 10% overhead threshold, so the
# function must fall back to the unconstrained optimum of 1.
assert result == 1

View File

@ -941,23 +941,24 @@ def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bo
def _find_best_group_size(
same_type_layers: dict["KVCacheSpec", list[str]],
vllm_config: "VllmConfig",
min_preferred_group_size: int = 3,
overhead_threshold: float = 0.10) -> int:
same_type_layers: dict["KVCacheSpec", list[str]],
vllm_config: "VllmConfig",
min_preferred_group_size: int = 3,
overhead_threshold: float = 0.10,
) -> int:
"""
Find the optimal group size that minimizes padding memory, preferring
larger group sizes.
For each layer type, padding = (group_size - count % group_size) % group_size
weighted by that layer's max_memory_usage_bytes. Different layer types
weighted by that layer's max_memory_usage_bytes. Different layer types
contribute differently to total padding based on their actual memory usage
(e.g., full attention vs sliding window).
This function prefers LARGER group sizes. Empirically, small group sizes (1-2)
lead to KV cache memory being concentrated in just a few large tensors, which
can reduce performance due to memory allocation patterns.
The algorithm enforces group_size >= min_preferred_group_size (default 3),
unless doing so would add more than overhead_threshold (default 10%) extra
padding memory compared to the optimal unconstrained group size.
@ -973,7 +974,7 @@ def _find_best_group_size(
Returns:
The optimal group size (minimizes padding, ties broken by larger group size)
Raises:
ValueError: If same_type_layers is empty
"""
@ -999,12 +1000,11 @@ def _find_best_group_size(
def find_best_in_range(start: int, end: int) -> int:
"""Find best group size in [start, end] range.
Prefers larger group sizes when padding is equal.
Key: (padding_memory, -group_size) so larger group_size wins ties.
"""
return min(range(start, end + 1),
key=lambda gs: (calc_padding_memory(gs), -gs))
return min(range(start, end + 1), key=lambda gs: (calc_padding_memory(gs), -gs))
# Calculate baseline: optimal group size with no minimum constraint
baseline_group_size = find_best_in_range(1, max_layers)
@ -1020,8 +1020,11 @@ def _find_best_group_size(
# Check if enforcing the minimum preference adds too much overhead
# Overhead is measured relative to total memory
overhead = (preferred_padding - baseline_padding) / total_base_memory \
if total_base_memory > 0 else 0.0
overhead = (
(preferred_padding - baseline_padding) / total_base_memory
if total_base_memory > 0
else 0.0
)
if overhead > overhead_threshold:
# Fallback to baseline (allowing smaller group sizes)