mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-22 08:54:36 +08:00
ruff format
Signed-off-by: Vladimir Anisimov <vanisimov@nvidia.com>
This commit is contained in:
parent
2817110aa3
commit
2c92ed30cd
@ -1865,7 +1865,7 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
Tests for the _find_best_group_size function which finds optimal
|
||||
KV cache group sizes while preferring larger groups.
|
||||
|
||||
|
||||
Key behaviors:
|
||||
- Prefers LARGER group sizes
|
||||
- Enforces group_size >= 3 unless overhead exceeds 10%
|
||||
@ -1913,12 +1913,12 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
full_spec = new_kv_cache_spec()
|
||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||
|
||||
|
||||
same_type_layers = {
|
||||
sw_spec: [f"sw_{i}" for i in range(25)],
|
||||
full_spec: [f"full_{i}" for i in range(5)],
|
||||
}
|
||||
|
||||
|
||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||
# GCD(25, 5) = 5, so group_size=5 gives 0 padding
|
||||
# Larger sizes like 25 would give padding for full layers
|
||||
@ -1932,12 +1932,12 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
full_spec = new_kv_cache_spec()
|
||||
local_spec = new_sliding_window_spec(sliding_window=256)
|
||||
|
||||
|
||||
same_type_layers = {
|
||||
local_spec: [f"local_{i}" for i in range(24)],
|
||||
full_spec: [f"full_{i}" for i in range(8)],
|
||||
}
|
||||
|
||||
|
||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||
# GCD(24, 8) = 8, both 4 and 8 give 0 padding
|
||||
# Prefer 8 (larger group size = fewer groups)
|
||||
@ -1951,12 +1951,12 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
full_spec = new_kv_cache_spec()
|
||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||
|
||||
|
||||
same_type_layers = {
|
||||
full_spec: [f"full_{i}" for i in range(20)],
|
||||
sw_spec: [f"sw_{i}" for i in range(30)],
|
||||
}
|
||||
|
||||
|
||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||
# GCD(20, 30) = 10, both 5 and 10 divide evenly
|
||||
# Prefer 10 (larger = fewer groups)
|
||||
@ -1970,12 +1970,12 @@ class TestFindBestGroupSize:
|
||||
"""
|
||||
full_spec = new_kv_cache_spec()
|
||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||
|
||||
|
||||
same_type_layers = {
|
||||
sw_spec: [f"sw_{i}" for i in range(12)],
|
||||
full_spec: [f"full_{i}" for i in range(13)],
|
||||
}
|
||||
|
||||
|
||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||
# group_size=13: 1 padding for sw, 0 for full
|
||||
# 1 padding out of 25 total = 4% overhead, well under 10%
|
||||
@ -1984,25 +1984,24 @@ class TestFindBestGroupSize:
|
||||
def test_fallback_when_overhead_exceeds_threshold(self, vllm_config):
    """
    When enforcing min_group_size >= 3 adds > 10% overhead, fallback to 1.

    Example: 1 full + 5 sw layers.
    - group_size=1: 0 padding (optimal baseline)
    - group_size=3: need to pad 2 full layers + 1 sw layer = 3 padding layers
      That's 3 padding out of 6 total = 50% overhead, way over 10%
    - group_size=5: need to pad 4 full layers = 4 padding layers
      That's 4 padding out of 6 total = 67% overhead

    So group_size=1 should be chosen as the fallback.
    """
    full_spec = new_kv_cache_spec()
    sw_spec = new_sliding_window_spec(sliding_window=512)

    same_type_layers = {
        full_spec: ["full_0"],  # 1 full layer
        sw_spec: [f"sw_{i}" for i in range(5)],  # 5 sw layers
    }

    result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
    # group_size >= 3 would add > 10% overhead, so fallback to 1
    assert result == 1
|
||||
|
||||
|
||||
@ -941,23 +941,24 @@ def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bo
|
||||
|
||||
|
||||
def _find_best_group_size(
|
||||
same_type_layers: dict["KVCacheSpec", list[str]],
|
||||
vllm_config: "VllmConfig",
|
||||
min_preferred_group_size: int = 3,
|
||||
overhead_threshold: float = 0.10) -> int:
|
||||
same_type_layers: dict["KVCacheSpec", list[str]],
|
||||
vllm_config: "VllmConfig",
|
||||
min_preferred_group_size: int = 3,
|
||||
overhead_threshold: float = 0.10,
|
||||
) -> int:
|
||||
"""
|
||||
Find the optimal group size that minimizes padding memory, preferring
|
||||
larger group sizes.
|
||||
|
||||
For each layer type, padding = (group_size - count % group_size) % group_size
|
||||
weighted by that layer's max_memory_usage_bytes. Different layer types
|
||||
weighted by that layer's max_memory_usage_bytes. Different layer types
|
||||
contribute differently to total padding based on their actual memory usage
|
||||
(e.g., full attention vs sliding window).
|
||||
|
||||
This function prefers LARGER group sizes. Empirically, small group sizes (1-2)
|
||||
lead to KV cache memory being concentrated in just a few large tensors, which
|
||||
can reduce performance due to memory allocation patterns.
|
||||
|
||||
|
||||
The algorithm enforces group_size >= min_preferred_group_size (default 3),
|
||||
unless doing so would add more than overhead_threshold (default 10%) extra
|
||||
padding memory compared to the optimal unconstrained group size.
|
||||
@ -973,7 +974,7 @@ def _find_best_group_size(
|
||||
|
||||
Returns:
|
||||
The optimal group size (minimizes padding, ties broken by larger group size)
|
||||
|
||||
|
||||
Raises:
|
||||
ValueError: If same_type_layers is empty
|
||||
"""
|
||||
@ -999,12 +1000,11 @@ def _find_best_group_size(
|
||||
|
||||
def find_best_in_range(start: int, end: int) -> int:
|
||||
"""Find best group size in [start, end] range.
|
||||
|
||||
|
||||
Prefers larger group sizes when padding is equal.
|
||||
Key: (padding_memory, -group_size) so larger group_size wins ties.
|
||||
"""
|
||||
return min(range(start, end + 1),
|
||||
key=lambda gs: (calc_padding_memory(gs), -gs))
|
||||
return min(range(start, end + 1), key=lambda gs: (calc_padding_memory(gs), -gs))
|
||||
|
||||
# Calculate baseline: optimal group size with no minimum constraint
|
||||
baseline_group_size = find_best_in_range(1, max_layers)
|
||||
@ -1020,8 +1020,11 @@ def _find_best_group_size(
|
||||
|
||||
# Check if enforcing the minimum preference adds too much overhead
|
||||
# Overhead is measured relative to total memory
|
||||
overhead = (preferred_padding - baseline_padding) / total_base_memory \
|
||||
if total_base_memory > 0 else 0.0
|
||||
overhead = (
|
||||
(preferred_padding - baseline_padding) / total_base_memory
|
||||
if total_base_memory > 0
|
||||
else 0.0
|
||||
)
|
||||
|
||||
if overhead > overhead_threshold:
|
||||
# Fallback to baseline (allowing smaller group sizes)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user