From b0be6298c42bc5e49c74c5522feea81817d1da87 Mon Sep 17 00:00:00 2001 From: Vladimir Anisimov Date: Tue, 23 Dec 2025 09:07:32 -0800 Subject: [PATCH 1/3] Implement optimal group size calculation for KV cache layers, preferring larger sizes to minimize padding. Introduce `_find_best_group_size` function with overhead management and update related tests for accuracy. Signed-off-by: Vladimir Anisimov --- tests/v1/core/test_kv_cache_utils.py | 153 ++++++++++++++++++++++++++- vllm/v1/core/kv_cache_utils.py | 114 ++++++++++++++++---- 2 files changed, 248 insertions(+), 19 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index c84a51b6883dc..fc16b4b12bad7 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1292,8 +1292,12 @@ def test_allocate_with_lookahead(): def test_get_kv_cache_config_one_worker(): # pass max_model_len to pass check_enough_kv_cache_memory - model_config = ModelConfig(max_model_len=16) + # Use max_model_len=256 and max_num_batched_tokens=4 so that + # full attention layers (16 blocks) >> sliding window layers (2 blocks), + # making the overhead calculations work correctly for grouping + model_config = ModelConfig(max_model_len=256) vllm_config = VllmConfig(model_config=model_config) + vllm_config.scheduler_config.max_num_batched_tokens = 4 mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 # all layers are full attention -> single group @@ -1855,3 +1859,150 @@ def test_auto_fit_max_model_len_not_triggered(): vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32] ) assert vllm_config.model_config.max_model_len == 16 + + +class TestFindBestGroupSize: + """ + Tests for the _find_best_group_size function which finds optimal + KV cache group sizes while preferring larger groups. 
+ + Key behaviors: + - Prefers LARGER group sizes + - Enforces group_size >= 3 unless overhead exceeds 10% + - Raises ValueError on empty input + """ + + @pytest.fixture + def vllm_config(self): + """Create a minimal VllmConfig for testing.""" + model_config = ModelConfig(max_model_len=4096) + return VllmConfig(model_config=model_config) + + def test_empty_input_raises(self, vllm_config): + """Empty input should raise ValueError.""" + with pytest.raises(ValueError, match="must not be empty"): + kv_cache_utils._find_best_group_size({}, vllm_config) + + def test_single_layer_type_returns_layer_count(self, vllm_config): + """Homogeneous layers: group_size == num_layers (single group optimal).""" + spec = new_kv_cache_spec() + same_type_layers = {spec: [f"layer_{i}" for i in range(5)]} + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # With 5 homogeneous layers, optimal is group_size=5 (one group, no padding) + assert result == 5 + + def test_single_layer_returns_one(self, vllm_config): + """Single layer returns 1.""" + spec = new_kv_cache_spec() + same_type_layers = {spec: ["layer_0"]} + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + assert result == 1 + + def test_two_layers_returns_two(self, vllm_config): + """Two homogeneous layers -> group_size=2.""" + spec = new_kv_cache_spec() + same_type_layers = {spec: ["layer_0", "layer_1"]} + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # max_layers=2, min_preferred=3 >= max_layers, so returns max_layers=2 + assert result == 2 + + def test_gemma3_pattern_regression(self, vllm_config): + """ + Regression test: Gemma3-like model with 5:1 sw/full pattern. + 25 sw + 5 full: group_size=5 gives 0 padding for both. 
+ """ + full_spec = new_kv_cache_spec() + sw_spec = new_sliding_window_spec(sliding_window=512) + + same_type_layers = { + sw_spec: [f"sw_{i}" for i in range(25)], + full_spec: [f"full_{i}" for i in range(5)], + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # GCD(25, 5) = 5, so group_size=5 gives 0 padding + # Larger sizes like 25 would give padding for full layers + assert result == 5 + + def test_llama4_pattern_regression(self, vllm_config): + """ + Regression test: LLaMA4-like model with 3:1 local/full pattern. + 24 local + 8 full: group_size=8 gives 0 padding for both. + Prefer 8 over 4 because 8 is larger (fewer groups). + """ + full_spec = new_kv_cache_spec() + local_spec = new_sliding_window_spec(sliding_window=256) + + same_type_layers = { + local_spec: [f"local_{i}" for i in range(24)], + full_spec: [f"full_{i}" for i in range(8)], + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # GCD(24, 8) = 8, both 4 and 8 give 0 padding + # Prefer 8 (larger group size = fewer groups) + assert result == 8 + + def test_mixed_20_30_prefers_larger_group(self, vllm_config): + """ + 20 full + 30 sw layers. + Both group_size=5 and 10 give zero padding. + Prefer 10 because it's larger (fewer groups). + """ + full_spec = new_kv_cache_spec() + sw_spec = new_sliding_window_spec(sliding_window=512) + + same_type_layers = { + full_spec: [f"full_{i}" for i in range(20)], + sw_spec: [f"sw_{i}" for i in range(30)], + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # GCD(20, 30) = 10, both 5 and 10 divide evenly + # Prefer 10 (larger = fewer groups) + assert result == 10 + + def test_eagle_gpt_oss_20b_pattern_regression(self, vllm_config): + """ + Regression test: GPT-OSS-20B + Eagle pattern (12 sw + 13 full). + group_size=13: 1 padding layer for sw (small overhead), 0 for full. + This is acceptable overhead, so prefer 13. 
+ """ + full_spec = new_kv_cache_spec() + sw_spec = new_sliding_window_spec(sliding_window=512) + + same_type_layers = { + sw_spec: [f"sw_{i}" for i in range(12)], + full_spec: [f"full_{i}" for i in range(13)], + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # group_size=13: 1 padding for sw, 0 for full + # 1 padding out of 25 total = 4% overhead, well under 10% + assert result == 13 + + def test_fallback_when_overhead_exceeds_threshold(self, vllm_config): + """ + When enforcing min_group_size >= 3 adds > 10% overhead, fallback to 1. + + Example: 1 full + 5 sw layers. + - group_size=1: 0 padding (optimal baseline) + - group_size=3: need to pad 2 full layers + 1 sw layer = 3 padding layers + That's 3 padding out of 6 total = 50% overhead, way over 10% + - group_size=5: need to pad 4 full layers = 4 padding layers + That's 4 padding out of 6 total = 67% overhead + + So group_size=1 should be chosen as the fallback. + """ + full_spec = new_kv_cache_spec() + sw_spec = new_sliding_window_spec(sliding_window=512) + + same_type_layers = { + full_spec: ["full_0"], # 1 full layer + sw_spec: [f"sw_{i}" for i in range(5)], # 5 sw layers + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # group_size >= 3 would add > 10% overhead, so fallback to 1 + assert result == 1 + diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 85afff38e486a..9d9f8a95ea297 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -940,8 +940,99 @@ def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bo return not kv_cache_spec +def _find_best_group_size( + same_type_layers: dict["KVCacheSpec", list[str]], + vllm_config: "VllmConfig", + min_preferred_group_size: int = 3, + overhead_threshold: float = 0.10) -> int: + """ + Find the optimal group size that minimizes padding memory, preferring + larger group sizes (fewer tensors). 
+ + For each layer type, padding = (group_size - count % group_size) % group_size + weighted by that layer's max_memory_usage_bytes. Different layer types + contribute differently to total padding based on their actual memory usage + (e.g., full attention vs sliding window). + + This function prefers LARGER group sizes. Empirically, small group sizes (1-2) + lead to KV cache memory being concentrated in just a few large tensors, which + can reduce performance due to memory allocation patterns. + + The algorithm enforces group_size >= min_preferred_group_size (default 3), + unless doing so would add more than overhead_threshold (default 10%) extra + padding memory compared to the optimal unconstrained group size. + + Args: + same_type_layers: Dict mapping KVCacheSpec to list of layer names. + Must not be empty. + vllm_config: The global VllmConfig, used to compute max_memory_usage_bytes + min_preferred_group_size: Preferred minimum group size (default 3). + Group sizes below this are avoided unless overhead exceeds threshold. + overhead_threshold: Maximum allowed overhead ratio (default 0.10 = 10%) + before falling back to smaller group sizes. 
+ + Returns: + The optimal group size (minimizes padding, ties broken by larger group size) + + Raises: + ValueError: If same_type_layers is empty + """ + if not same_type_layers: + raise ValueError("same_type_layers must not be empty") + + # Extract (layer_count, max_memory_usage_bytes) per spec + # max_memory_usage_bytes properly weights full attention vs sliding window + layer_info = [ + (len(layers), spec.max_memory_usage_bytes(vllm_config)) + for spec, layers in same_type_layers.items() + ] + + max_layers = max(count for count, _ in layer_info) + total_base_memory = sum(count * mem_size for count, mem_size in layer_info) + + def calc_padding_memory(group_size: int) -> int: + """Total padding memory, weighted by each layer type's memory size.""" + return sum( + ((group_size - count % group_size) % group_size) * mem_size + for count, mem_size in layer_info + ) + + def find_best_in_range(start: int, end: int) -> int: + """Find best group size in [start, end] range. + + Prefers larger group sizes (fewer tensors) when padding is equal. + Key: (padding_memory, -group_size) so larger group_size wins ties. 
+ """ + return min(range(start, end + 1), + key=lambda gs: (calc_padding_memory(gs), -gs)) + + # Calculate baseline: optimal group size with no minimum constraint + baseline_group_size = find_best_in_range(1, max_layers) + baseline_padding = calc_padding_memory(baseline_group_size) + + # If preferred minimum is >= max_layers, just use max_layers + if min_preferred_group_size >= max_layers: + return max_layers + + # Calculate preferred: optimal group size with minimum constraint + preferred_group_size = find_best_in_range(min_preferred_group_size, max_layers) + preferred_padding = calc_padding_memory(preferred_group_size) + + # Check if enforcing the minimum preference adds too much overhead + # Overhead is measured relative to total memory + overhead = (preferred_padding - baseline_padding) / total_base_memory \ + if total_base_memory > 0 else 0.0 + + if overhead > overhead_threshold: + # Fallback to baseline (allowing smaller group sizes) + return baseline_group_size + + return preferred_group_size + + def _get_kv_cache_groups_uniform_page_size( kv_cache_spec: dict[str, KVCacheSpec], + vllm_config: "VllmConfig", ) -> list[KVCacheGroupSpec]: """ Generates the KV cache groups for hybrid models with multiple @@ -1017,23 +1108,10 @@ def _get_kv_cache_groups_uniform_page_size( # E.g., (full.0, full.1), (sw.0, sw.1, sw.2) # split to 3 groups with 2 layers each: # (full.0, full.1), (sw.0, sw.2), (sw.1, padding). - # FIXME(Chen): At the moment of writing this code (2025-06-02), all - # open-source hybrid model follows a n:1 pattern between different attention - # types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and - # full), so we can use the "1" in the n:1 pattern as the group size, which - # is the minimum number of layers among all attention types. Need a better - # strategy if we want to support more complex patterns (e.g., 20 full + 30 - # sw, where the group size should be 10). 
- min_num_layers = min([len(layers) for layers in same_type_layers.values()]) - group_size = min_num_layers - max_num_layers = max([len(layers) for layers in same_type_layers.values()]) - if max_num_layers < min_num_layers * 1.25: - # If the number of layers is not much larger than the minimum number of layers, - # use the maximum number of layers as the group size to avoid too many padding - # layers. A typical example is gpt-oss-20b + eagle, with 12 sw + 13 full. We - # pad it to (13 sw, 13 full) instead of (12 sw, 24 full). 1.25 is just a - # magic number to avoid too many padding layers. - group_size = max_num_layers + # Find optimal group_size by trying all options and choosing the one with + # minimal padding (weighted by layer memory size). Prefers larger group sizes + # (fewer tensors) and enforces group_size >= 3 unless overhead exceeds 20%. + group_size = _find_best_group_size(same_type_layers, vllm_config) grouped_layers = [] for layers in same_type_layers.values(): num_padding_layers = group_size - len(layers) % group_size @@ -1239,7 +1317,7 @@ def get_kv_cache_groups( # have the same physical memory per block per layer. Split the layers # into groups with the same number of layers, and thus same total page # size. 
- return _get_kv_cache_groups_uniform_page_size(kv_cache_spec) + return _get_kv_cache_groups_uniform_page_size(kv_cache_spec, vllm_config) def generate_scheduler_kv_cache_config( From 2817110aa377247a34a646aac955ccf56de5a66f Mon Sep 17 00:00:00 2001 From: Vladimir Anisimov Date: Wed, 24 Dec 2025 02:09:52 -0800 Subject: [PATCH 2/3] some docstring updates Signed-off-by: Vladimir Anisimov --- vllm/v1/core/kv_cache_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 9d9f8a95ea297..ab746cab63660 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -947,7 +947,7 @@ def _find_best_group_size( overhead_threshold: float = 0.10) -> int: """ Find the optimal group size that minimizes padding memory, preferring - larger group sizes (fewer tensors). + larger group sizes. For each layer type, padding = (group_size - count % group_size) % group_size weighted by that layer's max_memory_usage_bytes. Different layer types @@ -1000,7 +1000,7 @@ def _find_best_group_size( def find_best_in_range(start: int, end: int) -> int: """Find best group size in [start, end] range. - Prefers larger group sizes (fewer tensors) when padding is equal. + Prefers larger group sizes when padding is equal. Key: (padding_memory, -group_size) so larger group_size wins ties. """ return min(range(start, end + 1), @@ -1110,7 +1110,7 @@ def _get_kv_cache_groups_uniform_page_size( # (full.0, full.1), (sw.0, sw.2), (sw.1, padding). # Find optimal group_size by trying all options and choosing the one with # minimal padding (weighted by layer memory size). Prefers larger group sizes - # (fewer tensors) and enforces group_size >= 3 unless overhead exceeds 20%. + # and enforces group_size >= 3 unless overhead exceeds the threshold. 
group_size = _find_best_group_size(same_type_layers, vllm_config) grouped_layers = [] for layers in same_type_layers.values(): From 2c92ed30cd8edaf3e6045e9f5ed981b2651a2a88 Mon Sep 17 00:00:00 2001 From: Vladimir Anisimov Date: Wed, 24 Dec 2025 02:32:12 -0800 Subject: [PATCH 3/3] ruff format Signed-off-by: Vladimir Anisimov --- tests/v1/core/test_kv_cache_utils.py | 27 +++++++++++++-------------- vllm/v1/core/kv_cache_utils.py | 27 +++++++++++++++------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index fc16b4b12bad7..44c92ef93c3b5 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1865,7 +1865,7 @@ class TestFindBestGroupSize: """ Tests for the _find_best_group_size function which finds optimal KV cache group sizes while preferring larger groups. - + Key behaviors: - Prefers LARGER group sizes - Enforces group_size >= 3 unless overhead exceeds 10% @@ -1913,12 +1913,12 @@ class TestFindBestGroupSize: """ full_spec = new_kv_cache_spec() sw_spec = new_sliding_window_spec(sliding_window=512) - + same_type_layers = { sw_spec: [f"sw_{i}" for i in range(25)], full_spec: [f"full_{i}" for i in range(5)], } - + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) # GCD(25, 5) = 5, so group_size=5 gives 0 padding # Larger sizes like 25 would give padding for full layers @@ -1932,12 +1932,12 @@ class TestFindBestGroupSize: """ full_spec = new_kv_cache_spec() local_spec = new_sliding_window_spec(sliding_window=256) - + same_type_layers = { local_spec: [f"local_{i}" for i in range(24)], full_spec: [f"full_{i}" for i in range(8)], } - + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) # GCD(24, 8) = 8, both 4 and 8 give 0 padding # Prefer 8 (larger group size = fewer groups) @@ -1951,12 +1951,12 @@ class TestFindBestGroupSize: """ full_spec = new_kv_cache_spec() sw_spec = 
new_sliding_window_spec(sliding_window=512) - + same_type_layers = { full_spec: [f"full_{i}" for i in range(20)], sw_spec: [f"sw_{i}" for i in range(30)], } - + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) # GCD(20, 30) = 10, both 5 and 10 divide evenly # Prefer 10 (larger = fewer groups) @@ -1970,12 +1970,12 @@ class TestFindBestGroupSize: """ full_spec = new_kv_cache_spec() sw_spec = new_sliding_window_spec(sliding_window=512) - + same_type_layers = { sw_spec: [f"sw_{i}" for i in range(12)], full_spec: [f"full_{i}" for i in range(13)], } - + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) # group_size=13: 1 padding for sw, 0 for full # 1 padding out of 25 total = 4% overhead, well under 10% @@ -1984,25 +1984,24 @@ class TestFindBestGroupSize: def test_fallback_when_overhead_exceeds_threshold(self, vllm_config): """ When enforcing min_group_size >= 3 adds > 10% overhead, fallback to 1. - + Example: 1 full + 5 sw layers. - group_size=1: 0 padding (optimal baseline) - group_size=3: need to pad 2 full layers + 1 sw layer = 3 padding layers That's 3 padding out of 6 total = 50% overhead, way over 10% - group_size=5: need to pad 4 full layers = 4 padding layers That's 4 padding out of 6 total = 67% overhead - + So group_size=1 should be chosen as the fallback. 
""" full_spec = new_kv_cache_spec() sw_spec = new_sliding_window_spec(sliding_window=512) - + same_type_layers = { full_spec: ["full_0"], # 1 full layer sw_spec: [f"sw_{i}" for i in range(5)], # 5 sw layers } - + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) # group_size >= 3 would add > 10% overhead, so fallback to 1 assert result == 1 - diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index ab746cab63660..3665ef68202a0 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -941,23 +941,24 @@ def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bo def _find_best_group_size( - same_type_layers: dict["KVCacheSpec", list[str]], - vllm_config: "VllmConfig", - min_preferred_group_size: int = 3, - overhead_threshold: float = 0.10) -> int: + same_type_layers: dict["KVCacheSpec", list[str]], + vllm_config: "VllmConfig", + min_preferred_group_size: int = 3, + overhead_threshold: float = 0.10, +) -> int: """ Find the optimal group size that minimizes padding memory, preferring larger group sizes. For each layer type, padding = (group_size - count % group_size) % group_size - weighted by that layer's max_memory_usage_bytes. Different layer types + weighted by that layer's max_memory_usage_bytes. Different layer types contribute differently to total padding based on their actual memory usage (e.g., full attention vs sliding window). This function prefers LARGER group sizes. Empirically, small group sizes (1-2) lead to KV cache memory being concentrated in just a few large tensors, which can reduce performance due to memory allocation patterns. - + The algorithm enforces group_size >= min_preferred_group_size (default 3), unless doing so would add more than overhead_threshold (default 10%) extra padding memory compared to the optimal unconstrained group size. 
@@ -973,7 +974,7 @@ def _find_best_group_size( Returns: The optimal group size (minimizes padding, ties broken by larger group size) - + Raises: ValueError: If same_type_layers is empty """ @@ -999,12 +1000,11 @@ def _find_best_group_size( def find_best_in_range(start: int, end: int) -> int: """Find best group size in [start, end] range. - + Prefers larger group sizes when padding is equal. Key: (padding_memory, -group_size) so larger group_size wins ties. """ - return min(range(start, end + 1), - key=lambda gs: (calc_padding_memory(gs), -gs)) + return min(range(start, end + 1), key=lambda gs: (calc_padding_memory(gs), -gs)) # Calculate baseline: optimal group size with no minimum constraint baseline_group_size = find_best_in_range(1, max_layers) @@ -1020,8 +1020,11 @@ def _find_best_group_size( # Check if enforcing the minimum preference adds too much overhead # Overhead is measured relative to total memory - overhead = (preferred_padding - baseline_padding) / total_base_memory \ - if total_base_memory > 0 else 0.0 + overhead = ( + (preferred_padding - baseline_padding) / total_base_memory + if total_base_memory > 0 + else 0.0 + ) if overhead > overhead_threshold: # Fallback to baseline (allowing smaller group sizes)