mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-09 00:24:55 +08:00
Core Features: - Add pin_prefix parameter to SamplingParams for per-request prefix pinning - Implement pinned prefix caching in V1 engine KVCacheManager - Add pinned_prefix_cap_ratio (default 0.2) to control memory usage - Add enable_pinned_prefix global gate for conservative rollouts - Protect pinned blocks from LRU eviction in BlockPool Bug Fixes: - Fix multi-group budget bug with round-robin pinning strategy - Ensure global cap is never exceeded even with multiple KV cache groups - Use logical pinned depth (min across groups) for accurate reporting Management APIs: - Add HTTP endpoint POST /unpin_all_pinned_prefixes for memory reclamation - Implement complete call chain: API -> AsyncLLM -> EngineCore -> Scheduler -> KVCacheManager - Remove per-request unpin to keep API surface minimal Code Quality: - Replace manual @field_validator with Field(ge=0, le=1) for cleaner validation - Add comprehensive test coverage (unit + integration + E2E) - Add test_multi_group_prefix_pinning_respects_global_cap() for multi-group validation - Add test_unpin_all_pinned_prefixes_clears_pool() for unpin API validation Resolves: #23083 Signed-off-by: dongbo910220 <1275604947@qq.com>
19 lines
549 B
Python
19 lines
549 B
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
|
|
import pytest
|
|
|
|
from vllm.config import CacheConfig
|
|
|
|
|
|
def test_invalid_cap_ratio_over_one():
    """A pinned_prefix_cap_ratio greater than 1.0 must be rejected."""
    # The ratio is constrained to [0, 1]; 1.5 lies above the upper bound.
    with pytest.raises(ValueError):
        CacheConfig(pinned_prefix_cap_ratio=1.5)
|
|
|
|
|
|
def test_negative_cap_ratio_raises():
    """A negative pinned_prefix_cap_ratio must be rejected."""
    # The ratio is constrained to [0, 1]; -0.1 lies below the lower bound.
    with pytest.raises(ValueError):
        CacheConfig(pinned_prefix_cap_ratio=-0.1)
|