mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-29 00:07:10 +08:00
ruff format
Signed-off-by: Vladimir Anisimov <vanisimov@nvidia.com>
This commit is contained in:
parent
2817110aa3
commit
2c92ed30cd
@ -1865,7 +1865,7 @@ class TestFindBestGroupSize:
|
|||||||
"""
|
"""
|
||||||
Tests for the _find_best_group_size function which finds optimal
|
Tests for the _find_best_group_size function which finds optimal
|
||||||
KV cache group sizes while preferring larger groups.
|
KV cache group sizes while preferring larger groups.
|
||||||
|
|
||||||
Key behaviors:
|
Key behaviors:
|
||||||
- Prefers LARGER group sizes
|
- Prefers LARGER group sizes
|
||||||
- Enforces group_size >= 3 unless overhead exceeds 10%
|
- Enforces group_size >= 3 unless overhead exceeds 10%
|
||||||
@ -1913,12 +1913,12 @@ class TestFindBestGroupSize:
|
|||||||
"""
|
"""
|
||||||
full_spec = new_kv_cache_spec()
|
full_spec = new_kv_cache_spec()
|
||||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||||
|
|
||||||
same_type_layers = {
|
same_type_layers = {
|
||||||
sw_spec: [f"sw_{i}" for i in range(25)],
|
sw_spec: [f"sw_{i}" for i in range(25)],
|
||||||
full_spec: [f"full_{i}" for i in range(5)],
|
full_spec: [f"full_{i}" for i in range(5)],
|
||||||
}
|
}
|
||||||
|
|
||||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||||
# GCD(25, 5) = 5, so group_size=5 gives 0 padding
|
# GCD(25, 5) = 5, so group_size=5 gives 0 padding
|
||||||
# Larger sizes like 25 would give padding for full layers
|
# Larger sizes like 25 would give padding for full layers
|
||||||
@ -1932,12 +1932,12 @@ class TestFindBestGroupSize:
|
|||||||
"""
|
"""
|
||||||
full_spec = new_kv_cache_spec()
|
full_spec = new_kv_cache_spec()
|
||||||
local_spec = new_sliding_window_spec(sliding_window=256)
|
local_spec = new_sliding_window_spec(sliding_window=256)
|
||||||
|
|
||||||
same_type_layers = {
|
same_type_layers = {
|
||||||
local_spec: [f"local_{i}" for i in range(24)],
|
local_spec: [f"local_{i}" for i in range(24)],
|
||||||
full_spec: [f"full_{i}" for i in range(8)],
|
full_spec: [f"full_{i}" for i in range(8)],
|
||||||
}
|
}
|
||||||
|
|
||||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||||
# GCD(24, 8) = 8, both 4 and 8 give 0 padding
|
# GCD(24, 8) = 8, both 4 and 8 give 0 padding
|
||||||
# Prefer 8 (larger group size = fewer groups)
|
# Prefer 8 (larger group size = fewer groups)
|
||||||
@ -1951,12 +1951,12 @@ class TestFindBestGroupSize:
|
|||||||
"""
|
"""
|
||||||
full_spec = new_kv_cache_spec()
|
full_spec = new_kv_cache_spec()
|
||||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||||
|
|
||||||
same_type_layers = {
|
same_type_layers = {
|
||||||
full_spec: [f"full_{i}" for i in range(20)],
|
full_spec: [f"full_{i}" for i in range(20)],
|
||||||
sw_spec: [f"sw_{i}" for i in range(30)],
|
sw_spec: [f"sw_{i}" for i in range(30)],
|
||||||
}
|
}
|
||||||
|
|
||||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||||
# GCD(20, 30) = 10, both 5 and 10 divide evenly
|
# GCD(20, 30) = 10, both 5 and 10 divide evenly
|
||||||
# Prefer 10 (larger = fewer groups)
|
# Prefer 10 (larger = fewer groups)
|
||||||
@ -1970,12 +1970,12 @@ class TestFindBestGroupSize:
|
|||||||
"""
|
"""
|
||||||
full_spec = new_kv_cache_spec()
|
full_spec = new_kv_cache_spec()
|
||||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||||
|
|
||||||
same_type_layers = {
|
same_type_layers = {
|
||||||
sw_spec: [f"sw_{i}" for i in range(12)],
|
sw_spec: [f"sw_{i}" for i in range(12)],
|
||||||
full_spec: [f"full_{i}" for i in range(13)],
|
full_spec: [f"full_{i}" for i in range(13)],
|
||||||
}
|
}
|
||||||
|
|
||||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||||
# group_size=13: 1 padding for sw, 0 for full
|
# group_size=13: 1 padding for sw, 0 for full
|
||||||
# 1 padding out of 25 total = 4% overhead, well under 10%
|
# 1 padding out of 25 total = 4% overhead, well under 10%
|
||||||
@ -1984,25 +1984,24 @@ class TestFindBestGroupSize:
|
|||||||
def test_fallback_when_overhead_exceeds_threshold(self, vllm_config):
|
def test_fallback_when_overhead_exceeds_threshold(self, vllm_config):
|
||||||
"""
|
"""
|
||||||
When enforcing min_preferred_group_size >= 3 adds > 10% overhead, fall back to 1.
|
When enforcing min_preferred_group_size >= 3 adds > 10% overhead, fall back to 1.
|
||||||
|
|
||||||
Example: 1 full + 5 sw layers.
|
Example: 1 full + 5 sw layers.
|
||||||
- group_size=1: 0 padding (optimal baseline)
|
- group_size=1: 0 padding (optimal baseline)
|
||||||
- group_size=3: need to pad 2 full layers + 1 sw layer = 3 padding layers
|
- group_size=3: need to pad 2 full layers + 1 sw layer = 3 padding layers
|
||||||
That's 3 padding out of 6 total = 50% overhead, way over 10%
|
That's 3 padding out of 6 total = 50% overhead, way over 10%
|
||||||
- group_size=5: need to pad 4 full layers = 4 padding layers
|
- group_size=5: need to pad 4 full layers = 4 padding layers
|
||||||
That's 4 padding out of 6 total = 67% overhead
|
That's 4 padding out of 6 total = 67% overhead
|
||||||
|
|
||||||
So group_size=1 should be chosen as the fallback.
|
So group_size=1 should be chosen as the fallback.
|
||||||
"""
|
"""
|
||||||
full_spec = new_kv_cache_spec()
|
full_spec = new_kv_cache_spec()
|
||||||
sw_spec = new_sliding_window_spec(sliding_window=512)
|
sw_spec = new_sliding_window_spec(sliding_window=512)
|
||||||
|
|
||||||
same_type_layers = {
|
same_type_layers = {
|
||||||
full_spec: ["full_0"], # 1 full layer
|
full_spec: ["full_0"], # 1 full layer
|
||||||
sw_spec: [f"sw_{i}" for i in range(5)], # 5 sw layers
|
sw_spec: [f"sw_{i}" for i in range(5)], # 5 sw layers
|
||||||
}
|
}
|
||||||
|
|
||||||
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config)
|
||||||
# group_size >= 3 would add > 10% overhead, so fallback to 1
|
# group_size >= 3 would add > 10% overhead, so fallback to 1
|
||||||
assert result == 1
|
assert result == 1
|
||||||
|
|
||||||
|
|||||||
@ -941,23 +941,24 @@ def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bo
|
|||||||
|
|
||||||
|
|
||||||
def _find_best_group_size(
|
def _find_best_group_size(
|
||||||
same_type_layers: dict["KVCacheSpec", list[str]],
|
same_type_layers: dict["KVCacheSpec", list[str]],
|
||||||
vllm_config: "VllmConfig",
|
vllm_config: "VllmConfig",
|
||||||
min_preferred_group_size: int = 3,
|
min_preferred_group_size: int = 3,
|
||||||
overhead_threshold: float = 0.10) -> int:
|
overhead_threshold: float = 0.10,
|
||||||
|
) -> int:
|
||||||
"""
|
"""
|
||||||
Find the optimal group size that minimizes padding memory, preferring
|
Find the optimal group size that minimizes padding memory, preferring
|
||||||
larger group sizes.
|
larger group sizes.
|
||||||
|
|
||||||
For each layer type, padding = (group_size - count % group_size) % group_size
|
For each layer type, padding = (group_size - count % group_size) % group_size
|
||||||
weighted by that layer's max_memory_usage_bytes. Different layer types
|
weighted by that layer's max_memory_usage_bytes. Different layer types
|
||||||
contribute differently to total padding based on their actual memory usage
|
contribute differently to total padding based on their actual memory usage
|
||||||
(e.g., full attention vs sliding window).
|
(e.g., full attention vs sliding window).
|
||||||
|
|
||||||
This function prefers LARGER group sizes. Empirically, small group sizes (1-2)
|
This function prefers LARGER group sizes. Empirically, small group sizes (1-2)
|
||||||
lead to KV cache memory being concentrated in just a few large tensors, which
|
lead to KV cache memory being concentrated in just a few large tensors, which
|
||||||
can reduce performance due to memory allocation patterns.
|
can reduce performance due to memory allocation patterns.
|
||||||
|
|
||||||
The algorithm enforces group_size >= min_preferred_group_size (default 3),
|
The algorithm enforces group_size >= min_preferred_group_size (default 3),
|
||||||
unless doing so would add more than overhead_threshold (default 10%) extra
|
unless doing so would add more than overhead_threshold (default 10%) extra
|
||||||
padding memory compared to the optimal unconstrained group size.
|
padding memory compared to the optimal unconstrained group size.
|
||||||
@ -973,7 +974,7 @@ def _find_best_group_size(
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The optimal group size (minimizes padding, ties broken by larger group size)
|
The optimal group size (minimizes padding, ties broken by larger group size)
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ValueError: If same_type_layers is empty
|
ValueError: If same_type_layers is empty
|
||||||
"""
|
"""
|
||||||
@ -999,12 +1000,11 @@ def _find_best_group_size(
|
|||||||
|
|
||||||
def find_best_in_range(start: int, end: int) -> int:
|
def find_best_in_range(start: int, end: int) -> int:
|
||||||
"""Find best group size in [start, end] range.
|
"""Find best group size in [start, end] range.
|
||||||
|
|
||||||
Prefers larger group sizes when padding is equal.
|
Prefers larger group sizes when padding is equal.
|
||||||
Key: (padding_memory, -group_size) so larger group_size wins ties.
|
Key: (padding_memory, -group_size) so larger group_size wins ties.
|
||||||
"""
|
"""
|
||||||
return min(range(start, end + 1),
|
return min(range(start, end + 1), key=lambda gs: (calc_padding_memory(gs), -gs))
|
||||||
key=lambda gs: (calc_padding_memory(gs), -gs))
|
|
||||||
|
|
||||||
# Calculate baseline: optimal group size with no minimum constraint
|
# Calculate baseline: optimal group size with no minimum constraint
|
||||||
baseline_group_size = find_best_in_range(1, max_layers)
|
baseline_group_size = find_best_in_range(1, max_layers)
|
||||||
@ -1020,8 +1020,11 @@ def _find_best_group_size(
|
|||||||
|
|
||||||
# Check if enforcing the minimum preference adds too much overhead
|
# Check if enforcing the minimum preference adds too much overhead
|
||||||
# Overhead is measured relative to total memory
|
# Overhead is measured relative to total memory
|
||||||
overhead = (preferred_padding - baseline_padding) / total_base_memory \
|
overhead = (
|
||||||
if total_base_memory > 0 else 0.0
|
(preferred_padding - baseline_padding) / total_base_memory
|
||||||
|
if total_base_memory > 0
|
||||||
|
else 0.0
|
||||||
|
)
|
||||||
|
|
||||||
if overhead > overhead_threshold:
|
if overhead > overhead_threshold:
|
||||||
# Fallback to baseline (allowing smaller group sizes)
|
# Fallback to baseline (allowing smaller group sizes)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user