Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2026-03-19 21:37:26 +08:00).
Merge commit 437ac4e0477dd8ddad8ddccacd7c7965d24bb029 into 254f6b986720c92ddf97fbb1a6a6465da8e87e29.
This commit is contained in:
commit
85489c9979
@ -1292,8 +1292,12 @@ def test_allocate_with_lookahead():
|
||||
|
||||
def test_get_kv_cache_config_one_worker():
|
||||
# pass max_model_len to pass check_enough_kv_cache_memory
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
# Use max_model_len=256 and max_num_batched_tokens=4 so that
|
||||
# full attention layers (16 blocks) >> sliding window layers (2 blocks),
|
||||
# making the overhead calculations work correctly for grouping
|
||||
model_config = ModelConfig(max_model_len=256)
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 4
|
||||
|
||||
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
|
||||
# all layers are full attention -> single group
|
||||
@ -1855,3 +1859,149 @@ def test_auto_fit_max_model_len_not_triggered():
|
||||
vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
|
||||
)
|
||||
assert vllm_config.model_config.max_model_len == 16
|
||||
|
||||
|
||||
class TestFindBestGroupSize:
    """Unit tests for ``_find_best_group_size``.

    The function chooses a KV cache group size that minimizes padding memory,
    breaking ties in favor of LARGER group sizes (fewer groups). It enforces
    group_size >= 3 unless honoring that floor would cost more than 10% extra
    padding memory, and it rejects empty input with ``ValueError``.
    """

    @pytest.fixture
    def vllm_config(self):
        """A bare-bones VllmConfig, sufficient for group-size computation."""
        return VllmConfig(model_config=ModelConfig(max_model_len=4096))

    def test_empty_input_raises(self, vllm_config):
        """An empty layer mapping must be rejected with ValueError."""
        with pytest.raises(ValueError, match="must not be empty"):
            kv_cache_utils._find_best_group_size({}, vllm_config)

    def test_single_layer_type_returns_layer_count(self, vllm_config):
        """Homogeneous stack: one group holding every layer is optimal."""
        layers_by_spec = {new_kv_cache_spec(): [f"layer_{i}" for i in range(5)]}
        chosen = kv_cache_utils._find_best_group_size(layers_by_spec, vllm_config)
        # Five identical layers fit in a single group of five with no padding.
        assert chosen == 5

    def test_single_layer_returns_one(self, vllm_config):
        """A lone layer yields group_size == 1."""
        layers_by_spec = {new_kv_cache_spec(): ["layer_0"]}
        chosen = kv_cache_utils._find_best_group_size(layers_by_spec, vllm_config)
        assert chosen == 1

    def test_two_layers_returns_two(self, vllm_config):
        """Two homogeneous layers yield group_size == 2."""
        layers_by_spec = {new_kv_cache_spec(): ["layer_0", "layer_1"]}
        chosen = kv_cache_utils._find_best_group_size(layers_by_spec, vllm_config)
        # min_preferred (3) >= max_layers (2), so max_layers is returned.
        assert chosen == 2

    def test_gemma3_pattern_regression(self, vllm_config):
        """Gemma3-like 5:1 sliding-window/full pattern (25 sw + 5 full).

        group_size=5 divides both counts, so neither type needs padding.
        """
        full_spec = new_kv_cache_spec()
        sw_spec = new_sliding_window_spec(sliding_window=512)

        layers_by_spec = {
            sw_spec: [f"sw_{i}" for i in range(25)],
            full_spec: [f"full_{i}" for i in range(5)],
        }

        chosen = kv_cache_utils._find_best_group_size(layers_by_spec, vllm_config)
        # gcd(25, 5) == 5 gives zero padding; any larger size would pad the
        # full-attention layers, so 5 wins.
        assert chosen == 5

    def test_llama4_pattern_regression(self, vllm_config):
        """LLaMA4-like 3:1 local/full pattern (24 local + 8 full).

        Both 4 and 8 give zero padding; 8 is preferred for being larger.
        """
        full_spec = new_kv_cache_spec()
        local_spec = new_sliding_window_spec(sliding_window=256)

        layers_by_spec = {
            local_spec: [f"local_{i}" for i in range(24)],
            full_spec: [f"full_{i}" for i in range(8)],
        }

        chosen = kv_cache_utils._find_best_group_size(layers_by_spec, vllm_config)
        # gcd(24, 8) == 8; the tie between 4 and 8 is broken toward the
        # larger group size (fewer groups).
        assert chosen == 8

    def test_mixed_20_30_prefers_larger_group(self, vllm_config):
        """20 full + 30 sliding-window layers.

        group_size 5 and 10 both divide evenly; 10 is preferred.
        """
        full_spec = new_kv_cache_spec()
        sw_spec = new_sliding_window_spec(sliding_window=512)

        layers_by_spec = {
            full_spec: [f"full_{i}" for i in range(20)],
            sw_spec: [f"sw_{i}" for i in range(30)],
        }

        chosen = kv_cache_utils._find_best_group_size(layers_by_spec, vllm_config)
        # gcd(20, 30) == 10; larger group size means fewer groups.
        assert chosen == 10

    def test_eagle_gpt_oss_20b_pattern_regression(self, vllm_config):
        """GPT-OSS-20B + Eagle pattern: 12 sw + 13 full layers.

        group_size=13 pads one sliding-window layer and no full layers;
        that small overhead is acceptable, so 13 should be chosen.
        """
        full_spec = new_kv_cache_spec()
        sw_spec = new_sliding_window_spec(sliding_window=512)

        layers_by_spec = {
            sw_spec: [f"sw_{i}" for i in range(12)],
            full_spec: [f"full_{i}" for i in range(13)],
        }

        chosen = kv_cache_utils._find_best_group_size(layers_by_spec, vllm_config)
        # One sw padding layer out of 25 total is well under the 10%
        # overhead threshold, so the larger group size 13 is kept.
        assert chosen == 13

    def test_fallback_when_overhead_exceeds_threshold(self, vllm_config):
        """Fallback below the preferred minimum when padding gets too costly.

        1 full + 5 sliding-window layers:
        - group_size=1 needs no padding (optimal baseline)
        - group_size=3 pads 2 full layers + 1 sw layer
        - group_size=5 pads 4 full layers
        Any size >= 3 blows past the 10% overhead threshold, so the
        baseline group size of 1 is returned.
        """
        full_spec = new_kv_cache_spec()
        sw_spec = new_sliding_window_spec(sliding_window=512)

        layers_by_spec = {
            full_spec: ["full_0"],  # 1 full layer
            sw_spec: [f"sw_{i}" for i in range(5)],  # 5 sw layers
        }

        chosen = kv_cache_utils._find_best_group_size(layers_by_spec, vllm_config)
        # Enforcing group_size >= 3 would add more than 10% padding memory,
        # so the unconstrained optimum (1) is used instead.
        assert chosen == 1
|
||||
|
||||
@ -946,8 +946,102 @@ def is_kv_cache_type_attention_free(kv_cache_spec: dict[str, KVCacheSpec]) -> bo
|
||||
return not kv_cache_spec
|
||||
|
||||
|
||||
def _find_best_group_size(
|
||||
same_type_layers: dict["KVCacheSpec", list[str]],
|
||||
vllm_config: "VllmConfig",
|
||||
min_preferred_group_size: int = 3,
|
||||
overhead_threshold: float = 0.10,
|
||||
) -> int:
|
||||
"""
|
||||
Find the optimal group size that minimizes padding memory, preferring
|
||||
larger group sizes.
|
||||
|
||||
For each layer type, padding = (group_size - count % group_size) % group_size
|
||||
weighted by that layer's max_memory_usage_bytes. Different layer types
|
||||
contribute differently to total padding based on their actual memory usage
|
||||
(e.g., full attention vs sliding window).
|
||||
|
||||
This function prefers LARGER group sizes. Empirically, small group sizes (1-2)
|
||||
lead to KV cache memory being concentrated in just a few large tensors, which
|
||||
can reduce performance due to memory allocation patterns.
|
||||
|
||||
The algorithm enforces group_size >= min_preferred_group_size (default 3),
|
||||
unless doing so would add more than overhead_threshold (default 10%) extra
|
||||
padding memory compared to the optimal unconstrained group size.
|
||||
|
||||
Args:
|
||||
same_type_layers: Dict mapping KVCacheSpec to list of layer names.
|
||||
Must not be empty.
|
||||
vllm_config: The global VllmConfig, used to compute max_memory_usage_bytes
|
||||
min_preferred_group_size: Preferred minimum group size (default 3).
|
||||
Group sizes below this are avoided unless overhead exceeds threshold.
|
||||
overhead_threshold: Maximum allowed overhead ratio (default 0.10 = 10%)
|
||||
before falling back to smaller group sizes.
|
||||
|
||||
Returns:
|
||||
The optimal group size (minimizes padding, ties broken by larger group size)
|
||||
|
||||
Raises:
|
||||
ValueError: If same_type_layers is empty
|
||||
"""
|
||||
if not same_type_layers:
|
||||
raise ValueError("same_type_layers must not be empty")
|
||||
|
||||
# Extract (layer_count, max_memory_usage_bytes) per spec
|
||||
# max_memory_usage_bytes properly weights full attention vs sliding window
|
||||
layer_info = [
|
||||
(len(layers), spec.max_memory_usage_bytes(vllm_config))
|
||||
for spec, layers in same_type_layers.items()
|
||||
]
|
||||
|
||||
max_layers = max(count for count, _ in layer_info)
|
||||
total_base_memory = sum(count * mem_size for count, mem_size in layer_info)
|
||||
|
||||
def calc_padding_memory(group_size: int) -> int:
|
||||
"""Total padding memory, weighted by each layer type's memory size."""
|
||||
return sum(
|
||||
((group_size - count % group_size) % group_size) * mem_size
|
||||
for count, mem_size in layer_info
|
||||
)
|
||||
|
||||
def find_best_in_range(start: int, end: int) -> int:
|
||||
"""Find best group size in [start, end] range.
|
||||
|
||||
Prefers larger group sizes when padding is equal.
|
||||
Key: (padding_memory, -group_size) so larger group_size wins ties.
|
||||
"""
|
||||
return min(range(start, end + 1), key=lambda gs: (calc_padding_memory(gs), -gs))
|
||||
|
||||
# Calculate baseline: optimal group size with no minimum constraint
|
||||
baseline_group_size = find_best_in_range(1, max_layers)
|
||||
baseline_padding = calc_padding_memory(baseline_group_size)
|
||||
|
||||
# If preferred minimum is >= max_layers, just use max_layers
|
||||
if min_preferred_group_size >= max_layers:
|
||||
return max_layers
|
||||
|
||||
# Calculate preferred: optimal group size with minimum constraint
|
||||
preferred_group_size = find_best_in_range(min_preferred_group_size, max_layers)
|
||||
preferred_padding = calc_padding_memory(preferred_group_size)
|
||||
|
||||
# Check if enforcing the minimum preference adds too much overhead
|
||||
# Overhead is measured relative to total memory
|
||||
overhead = (
|
||||
(preferred_padding - baseline_padding) / total_base_memory
|
||||
if total_base_memory > 0
|
||||
else 0.0
|
||||
)
|
||||
|
||||
if overhead > overhead_threshold:
|
||||
# Fallback to baseline (allowing smaller group sizes)
|
||||
return baseline_group_size
|
||||
|
||||
return preferred_group_size
|
||||
|
||||
|
||||
def _get_kv_cache_groups_uniform_page_size(
|
||||
kv_cache_spec: dict[str, KVCacheSpec],
|
||||
vllm_config: "VllmConfig",
|
||||
) -> list[KVCacheGroupSpec]:
|
||||
"""
|
||||
Generates the KV cache groups for hybrid models with multiple
|
||||
@ -1023,23 +1117,10 @@ def _get_kv_cache_groups_uniform_page_size(
|
||||
# E.g., (full.0, full.1), (sw.0, sw.1, sw.2)
|
||||
# split to 3 groups with 2 layers each:
|
||||
# (full.0, full.1), (sw.0, sw.2), (sw.1, padding).
|
||||
# FIXME(Chen): At the moment of writing this code (2025-06-02), all
|
||||
# open-source hybrid model follows a n:1 pattern between different attention
|
||||
# types (e.g., Gemma3 5:1 between sw and full, LLaMA4 3:1 between local and
|
||||
# full), so we can use the "1" in the n:1 pattern as the group size, which
|
||||
# is the minimum number of layers among all attention types. Need a better
|
||||
# strategy if we want to support more complex patterns (e.g., 20 full + 30
|
||||
# sw, where the group size should be 10).
|
||||
min_num_layers = min([len(layers) for layers in same_type_layers.values()])
|
||||
group_size = min_num_layers
|
||||
max_num_layers = max([len(layers) for layers in same_type_layers.values()])
|
||||
if max_num_layers < min_num_layers * 1.25:
|
||||
# If the number of layers is not much larger than the minimum number of layers,
|
||||
# use the maximum number of layers as the group size to avoid too many padding
|
||||
# layers. A typical example is gpt-oss-20b + eagle, with 12 sw + 13 full. We
|
||||
# pad it to (13 sw, 13 full) instead of (12 sw, 24 full). 1.25 is just a
|
||||
# magic number to avoid too many padding layers.
|
||||
group_size = max_num_layers
|
||||
# Find optimal group_size by trying all options and choosing the one with
|
||||
# minimal padding (weighted by layer memory size). Prefers larger group sizes
|
||||
# and enforces group_size >= 3 unless overhead exceeds the threshold.
|
||||
group_size = _find_best_group_size(same_type_layers, vllm_config)
|
||||
grouped_layers = []
|
||||
for layers in same_type_layers.values():
|
||||
num_padding_layers = group_size - len(layers) % group_size
|
||||
@ -1245,7 +1326,7 @@ def get_kv_cache_groups(
|
||||
# have the same physical memory per block per layer. Split the layers
|
||||
# into groups with the same number of layers, and thus same total page
|
||||
# size.
|
||||
return _get_kv_cache_groups_uniform_page_size(kv_cache_spec)
|
||||
return _get_kv_cache_groups_uniform_page_size(kv_cache_spec, vllm_config)
|
||||
|
||||
|
||||
def generate_scheduler_kv_cache_config(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user