diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index c84a51b6883dc..44c92ef93c3b5 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1292,8 +1292,12 @@ def test_allocate_with_lookahead(): def test_get_kv_cache_config_one_worker(): # pass max_model_len to pass check_enough_kv_cache_memory - model_config = ModelConfig(max_model_len=16) + # Use max_model_len=256 and max_num_batched_tokens=4 so that + # full attention layers (16 blocks) >> sliding window layers (2 blocks), + # making the overhead calculations work correctly for grouping + model_config = ModelConfig(max_model_len=256) vllm_config = VllmConfig(model_config=model_config) + vllm_config.scheduler_config.max_num_batched_tokens = 4 mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 # all layers are full attention -> single group @@ -1855,3 +1859,149 @@ def test_auto_fit_max_model_len_not_triggered(): vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32] ) assert vllm_config.model_config.max_model_len == 16 + + +class TestFindBestGroupSize: + """ + Tests for the _find_best_group_size function which finds optimal + KV cache group sizes while preferring larger groups. + + Key behaviors: + - Prefers LARGER group sizes + - Enforces group_size >= 3 unless overhead exceeds 10% + - Raises ValueError on empty input + """ + + @pytest.fixture + def vllm_config(self): + """Create a minimal VllmConfig for testing.""" + model_config = ModelConfig(max_model_len=4096) + return VllmConfig(model_config=model_config) + + def test_empty_input_raises(self, vllm_config): + """Empty input should raise ValueError.""" + with pytest.raises(ValueError, match="must not be empty"): + kv_cache_utils._find_best_group_size({}, vllm_config) + + def test_single_layer_type_returns_layer_count(self, vllm_config): + """Homogeneous layers: group_size == num_layers (single group optimal).""" + spec = new_kv_cache_spec() + same_type_layers = {spec: [f"layer_{i}" for i in range(5)]} + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # With 5 homogeneous layers, optimal is group_size=5 (one group, no padding) + assert result == 5 + + def test_single_layer_returns_one(self, vllm_config): + """Single layer returns 1.""" + spec = new_kv_cache_spec() + same_type_layers = {spec: ["layer_0"]} + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + assert result == 1 + + def test_two_layers_returns_two(self, vllm_config): + """Two homogeneous layers -> group_size=2.""" + spec = new_kv_cache_spec() + same_type_layers = {spec: ["layer_0", "layer_1"]} + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # max_layers=2, min_preferred=3 >= max_layers, so returns max_layers=2 + assert result == 2 + + def test_gemma3_pattern_regression(self, vllm_config): + """ + Regression test: Gemma3-like model with 5:1 sw/full pattern. + 25 sw + 5 full: group_size=5 gives 0 padding for both. + """ + full_spec = new_kv_cache_spec() + sw_spec = new_sliding_window_spec(sliding_window=512) + + same_type_layers = { + sw_spec: [f"sw_{i}" for i in range(25)], + full_spec: [f"full_{i}" for i in range(5)], + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # GCD(25, 5) = 5, so group_size=5 gives 0 padding + # Larger sizes like 25 would give padding for full layers + assert result == 5 + + def test_llama4_pattern_regression(self, vllm_config): + """ + Regression test: LLaMA4-like model with 3:1 local/full pattern. + 24 local + 8 full: group_size=8 gives 0 padding for both. + Prefer 8 over 4 because 8 is larger (fewer groups). + """ + full_spec = new_kv_cache_spec() + local_spec = new_sliding_window_spec(sliding_window=256) + + same_type_layers = { + local_spec: [f"local_{i}" for i in range(24)], + full_spec: [f"full_{i}" for i in range(8)], + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # GCD(24, 8) = 8, both 4 and 8 give 0 padding + # Prefer 8 (larger group size = fewer groups) + assert result == 8 + + def test_mixed_20_30_prefers_larger_group(self, vllm_config): + """ + 20 full + 30 sw layers. + Both group_size=5 and 10 give zero padding. + Prefer 10 because it's larger (fewer groups). + """ + full_spec = new_kv_cache_spec() + sw_spec = new_sliding_window_spec(sliding_window=512) + + same_type_layers = { + full_spec: [f"full_{i}" for i in range(20)], + sw_spec: [f"sw_{i}" for i in range(30)], + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # GCD(20, 30) = 10, both 5 and 10 divide evenly + # Prefer 10 (larger = fewer groups) + assert result == 10 + + def test_eagle_gpt_oss_20b_pattern_regression(self, vllm_config): + """ + Regression test: GPT-OSS-20B + Eagle pattern (12 sw + 13 full). + group_size=13: 1 padding layer for sw (small overhead), 0 for full. + This is acceptable overhead, so prefer 13. + """ + full_spec = new_kv_cache_spec() + sw_spec = new_sliding_window_spec(sliding_window=512) + + same_type_layers = { + sw_spec: [f"sw_{i}" for i in range(12)], + full_spec: [f"full_{i}" for i in range(13)], + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # group_size=13: 1 padding for sw, 0 for full + # 1 padding out of 25 total = 4% overhead, well under 10% + assert result == 13 + + def test_fallback_when_overhead_exceeds_threshold(self, vllm_config): + """ + When enforcing min_group_size >= 3 adds > 10% overhead, fallback to 1. + + Example: 1 full + 5 sw layers. + - group_size=1: 0 padding (optimal baseline) + - group_size=3: need to pad 2 full layers + 1 sw layer = 3 padding layers + That's 3 padding out of 6 total = 10% overhead, way over 10% + - group_size=5: need to pad 4 full layers = 4 padding layers + That's 4 padding out of 6 total = 67% overhead + + So group_size=1 should be chosen as the fallback. + """ + full_spec = new_kv_cache_spec() + sw_spec = new_sliding_window_spec(sliding_window=512) + + same_type_layers = { + full_spec: ["full_0"], # 1 full layer + sw_spec: [f"sw_{i}" for i in range(5)], # 5 sw layers + } + + result = kv_cache_utils._find_best_group_size(same_type_layers, vllm_config) + # group_size >= 3 would add > 10% overhead, so fallback to 1 + assert result == 1 diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 1480a1f798ea0..b4255f5e4f57f 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -946,8 +946,102 @@ def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bo return not kv_cache_spec +def _find_best_group_size( + same_type_layers: dict["KVCacheSpec", list[str]], + vllm_config: "VllmConfig", + min_preferred_group_size: int = 3, + overhead_threshold: float = 0.10, +) -> int: + """ + Find the optimal group size that minimizes padding memory, preferring + larger group sizes. + + For each layer type, padding = (group_size - count % group_size) % group_size + weighted by that layer's max_memory_usage_bytes. Different layer types + contribute differently to total padding based on their actual memory usage + (e.g., full attention vs sliding window). + + This function prefers LARGER group sizes. Empirically, small group sizes (1-2) + lead to KV cache memory being concentrated in just a few large tensors, which + can reduce performance due to memory allocation patterns. + + The algorithm enforces group_size >= min_preferred_group_size (default 3), + unless doing so would add more than overhead_threshold (default 10%) extra + padding memory compared to the optimal unconstrained group size. + + Args: + same_type_layers: Dict mapping KVCacheSpec to list of layer names. + Must not be empty. + vllm_config: The global VllmConfig, used to compute max_memory_usage_bytes + min_preferred_group_size: Preferred minimum group size (default 3). + Group sizes below this are avoided unless overhead exceeds threshold. + overhead_threshold: Maximum allowed overhead ratio (default 0.10 = 10%) + before falling back to smaller group sizes. + + Returns: + The optimal group size (minimizes padding, ties broken by larger group size) + + Raises: + ValueError: If same_type_layers is empty + """ + if not same_type_layers: + raise ValueError("same_type_layers must not be empty") + + # Extract (layer_count, max_memory_usage_bytes) per spec + # max_memory_usage_bytes properly weights full attention vs sliding window + layer_info = [ + (len(layers), spec.max_memory_usage_bytes(vllm_config)) + for spec, layers in same_type_layers.items() + ] + + max_layers = max(count for count, _ in layer_info) + total_base_memory = sum(count * mem_size for count, mem_size in layer_info) + + def calc_padding_memory(group_size: int) -> int: + """Total padding memory, weighted by each layer type's memory size.""" + return sum( + ((group_size - count % group_size) % group_size) * mem_size + for count, mem_size in layer_info + ) + + def find_best_in_range(start: int, end: int) -> int: + """Find best group size in [start, end] range. + + Prefers larger group sizes when padding is equal. + Key: (padding_memory, -group_size) so larger group_size wins ties. + """ + return min(range(start, end + 1), key=lambda gs: (calc_padding_memory(gs), -gs)) + + # Calculate baseline: optimal group size with no minimum constraint + baseline_group_size = find_best_in_range(1, max_layers) + baseline_padding = calc_padding_memory(baseline_group_size) + + # If preferred minimum is >= max_layers, just use max_layers + if min_preferred_group_size >= max_layers: + return max_layers + + # Calculate preferred: optimal group size with minimum constraint + preferred_group_size = find_best_in_range(min_preferred_group_size, max_layers) + preferred_padding = calc_padding_memory(preferred_group_size) + + # Check if enforcing the minimum preference adds too much overhead + # Overhead is measured relative to total memory + overhead = ( + (preferred_padding - baseline_padding) / total_base_memory + if total_base_memory > 0 + else 0.0 + ) + + if overhead > overhead_threshold: + # Fallback to baseline (allowing smaller group sizes) + return baseline_group_size + + return preferred_group_size + + def _get_kv_cache_groups_uniform_page_size( kv_cache_spec: dict[str, KVCacheSpec], + vllm_config: "VllmConfig", ) -> list[KVCacheGroupSpec]: """ Generates the KV cache groups for hybrid models with multiple @@ -1023,23 +1117,10 @@ def _get_kv_cache_groups_uniform_page_size( # E.g., (full.0, full.1), (sw.0, sw.1, sw.2) # split to 3 groups with 2 layers each: # (full.0, full.1), (sw.0, sw.2), (sw.1, padding). - # FIXME(Chen): At the moment of writing this code (2025-06-02), all - # open-source hybrid model follows a n:1 pattern between different attention - # types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and - # full), so we can use the "1" in the n:1 pattern as the group size, which - # is the minimum number of layers among all attention types. Need a better - # strategy if we want to support more complex patterns (e.g., 20 full + 30 - # sw, where the group size should be 10). - min_num_layers = min([len(layers) for layers in same_type_layers.values()]) - group_size = min_num_layers - max_num_layers = max([len(layers) for layers in same_type_layers.values()]) - if max_num_layers < min_num_layers * 1.25: - # If the number of layers is not much larger than the minimum number of layers, - # use the maximum number of layers as the group size to avoid too many padding - # layers. A typical example is gpt-oss-20b + eagle, with 12 sw + 13 full. We - # pad it to (13 sw, 13 full) instead of (12 sw, 24 full). 1.25 is just a - # magic number to avoid too many padding layers. - group_size = max_num_layers + # Find optimal group_size by trying all options and choosing the one with + # minimal padding (weighted by layer memory size). Prefers larger group sizes + # and enforces group_size >= 3 unless overhead exceeds the threshold. + group_size = _find_best_group_size(same_type_layers, vllm_config) grouped_layers = [] for layers in same_type_layers.values(): num_padding_layers = group_size - len(layers) % group_size @@ -1245,7 +1326,7 @@ def get_kv_cache_groups( # have the same physical memory per block per layer. Split the layers # into groups with the same number of layers, and thus same total page # size. - return _get_kv_cache_groups_uniform_page_size(kv_cache_spec) + return _get_kv_cache_groups_uniform_page_size(kv_cache_spec, vllm_config) def generate_scheduler_kv_cache_config(