[Hybrid allocator + kv connector] revert connector test changes related to hybrid allocator (#28011)

Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
This commit is contained in:
Kuntai Du 2025-11-05 02:36:31 -08:00 committed by GitHub
parent 16b37f3119
commit 86dca07d9b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 0 additions and 19 deletions

View File

@@ -891,7 +891,6 @@ def test_kv_connector_basic():
     scheduler = create_scheduler(
         enable_prefix_caching=True,
         use_kv_connector=True,
-        disable_hybrid_kv_cache_manager=True,
     )
     NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks()
     BLOCK_SIZE = scheduler.cache_config.block_size
@@ -1017,7 +1016,6 @@ def test_external_prefix_cache_metrics():
     scheduler = create_scheduler(
         enable_prefix_caching=False,
         use_kv_connector=True,
-        disable_hybrid_kv_cache_manager=True,
     )
     # Mock connector to simulate a partial external cache hit
@@ -1082,7 +1080,6 @@ def test_kv_connector_unable_to_allocate():
         use_kv_connector=True,
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
-        disable_hybrid_kv_cache_manager=True,
     )
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
     scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
@@ -1166,7 +1163,6 @@ def test_kv_connector_handles_preemption():
         use_kv_connector=True,
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
-        disable_hybrid_kv_cache_manager=True,
     )
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
@@ -1383,7 +1379,6 @@ def create_scheduler_with_priority(
     block_size: int = 16,
     max_model_len: int | None = None,
     num_speculative_tokens: int | None = None,
-    disable_hybrid_kv_cache_manager: bool = False,
 ) -> Scheduler:
     """Create scheduler with priority policy enabled.
@@ -1408,7 +1403,6 @@ def create_scheduler_with_priority(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
         policy="priority",  # Enable priority scheduling
-        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
     )
     model_config = ModelConfig(
         model=model,
@@ -2015,7 +2009,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
         num_blocks=5,  # Can hold 64 tokens (first block is null)
         block_size=16,  # Standard block size
         use_kv_connector=True,
-        disable_hybrid_kv_cache_manager=True,
     )
     # Create a request and schedule it

View File

@@ -46,7 +46,6 @@ def create_scheduler(
     num_speculative_tokens: int | None = None,
     skip_tokenizer_init: bool = False,
     async_scheduling: bool = False,
-    disable_hybrid_kv_cache_manager: bool = False,
 ) -> Scheduler | AsyncScheduler:
     """Create scheduler under test.
@@ -71,7 +70,6 @@ def create_scheduler(
         disable_chunked_mm_input=disable_chunked_mm_input,
         enable_chunked_prefill=True,
         async_scheduling=async_scheduling,
-        disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager,
     )
     model_config = ModelConfig(
         model=model,

View File

@@ -136,7 +136,6 @@ run_tests_for_model() {
     vllm serve $model_name \
         --port $PORT \
         --enforce-eager \
-        --disable-hybrid-kv-cache-manager \
         --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
         --tensor-parallel-size $PREFILLER_TP_SIZE \
         --kv-transfer-config '$KV_CONFIG'"
@@ -179,7 +178,6 @@ run_tests_for_model() {
         --port $PORT \
         --enforce-eager \
         --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
-        --disable-hybrid-kv-cache-manager \
         --kv-transfer-config '$KV_CONFIG'"
     # DP-EP attention mode

View File

@@ -85,7 +85,6 @@ run_tests_for_model() {
         --port $PREFILL_PORT \
         --enforce-eager \
         --gpu-memory-utilization 0.2 \
-        --disable-hybrid-kv-cache-manager \
         --kv-transfer-config '$KV_CONFIG'"
     if [ -n "$model_args" ]; then
@@ -104,7 +103,6 @@ run_tests_for_model() {
         --port $DECODE_PORT \
         --enforce-eager \
         --gpu-memory-utilization 0.2 \
-        --disable-hybrid-kv-cache-manager \
         --kv-transfer-config '$KV_CONFIG'"
     if [ -n "$model_args" ]; then

View File

@@ -114,7 +114,6 @@ def test_multi_shared_storage_connector_consistency():
         enforce_eager=True,
         gpu_memory_utilization=0.5,
         kv_transfer_config=kv_transfer_config,
-        disable_hybrid_kv_cache_manager=True,
     )
     # Run generation - this should trigger saving KV cache
     _ = llm.generate(PROMPTS, SAMPLING_PARAMS)

View File

@@ -1020,7 +1020,6 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
         "gpu_memory_utilization": 0.5,
         "kv_transfer_config": kv_transfer_config,
         "distributed_executor_backend": distributed_executor_backend,
-        "disable_hybrid_kv_cache_manager": True,
     }
     timeout = 6

View File

@@ -132,7 +132,6 @@ def test_shared_storage_connector_hashes(tmp_path):
         enforce_eager=True,
         kv_transfer_config=kv_transfer_config,
         limit_mm_per_prompt={"image": 2},
-        disable_hybrid_kv_cache_manager=True,
     )
     # don't put this import at the top level

View File

@@ -91,9 +91,6 @@ def create_vllm_config(
         max_num_batched_tokens=max_num_batched_tokens,
         max_model_len=max_model_len,
         enable_chunked_prefill=enable_chunked_prefill,
-        # Disable hybrid KV cache manager for testing
-        # Should be removed after we support hybrid KV cache manager-based testing.
-        disable_hybrid_kv_cache_manager=True,
     )
     model_config = ModelConfig(
         model=model,