From 86dca07d9bfd9f892f820d2d9a45b102d1f4a096 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 5 Nov 2025 02:36:31 -0800 Subject: [PATCH] [Hybrid allocator + kv connector] revert connector test changes related to hybrid allocator (#28011) Signed-off-by: KuntaiDu --- tests/v1/core/test_scheduler.py | 7 ------- tests/v1/core/utils.py | 2 -- .../v1/kv_connector/nixl_integration/run_accuracy_test.sh | 2 -- .../v1/kv_connector/nixl_integration/run_edge_case_test.sh | 2 -- tests/v1/kv_connector/unit/test_multi_connector.py | 1 - tests/v1/kv_connector/unit/test_nixl_connector.py | 1 - .../v1/kv_connector/unit/test_shared_storage_connector.py | 1 - tests/v1/kv_connector/unit/utils.py | 3 --- 8 files changed, 19 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 92e3831b9c7a6..749cf7dc8397e 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -891,7 +891,6 @@ def test_kv_connector_basic(): scheduler = create_scheduler( enable_prefix_caching=True, use_kv_connector=True, - disable_hybrid_kv_cache_manager=True, ) NUM_TOTAL_BLOCKS = scheduler.kv_cache_manager.block_pool.get_num_free_blocks() BLOCK_SIZE = scheduler.cache_config.block_size @@ -1017,7 +1016,6 @@ def test_external_prefix_cache_metrics(): scheduler = create_scheduler( enable_prefix_caching=False, use_kv_connector=True, - disable_hybrid_kv_cache_manager=True, ) # Mock connector to simulate a partial external cache hit @@ -1082,7 +1080,6 @@ def test_kv_connector_unable_to_allocate(): use_kv_connector=True, block_size=BLOCK_SIZE, num_blocks=NUM_BLOCKS, - disable_hybrid_kv_cache_manager=True, ) NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2 scheduler.connector.get_num_new_matched_tokens = Mock(name="method") @@ -1166,7 +1163,6 @@ def test_kv_connector_handles_preemption(): use_kv_connector=True, block_size=BLOCK_SIZE, num_blocks=NUM_BLOCKS, - disable_hybrid_kv_cache_manager=True, ) NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE @@ -1383,7 +1379,6 @@ def create_scheduler_with_priority( block_size: int = 16, max_model_len: int | None = None, num_speculative_tokens: int | None = None, - disable_hybrid_kv_cache_manager: bool = False, ) -> Scheduler: """Create scheduler with priority policy enabled. @@ -1408,7 +1403,6 @@ def create_scheduler_with_priority( disable_chunked_mm_input=disable_chunked_mm_input, enable_chunked_prefill=True, policy="priority", # Enable priority scheduling - disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager, ) model_config = ModelConfig( model=model, @@ -2015,7 +2009,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(): num_blocks=5, # Can hold 64 tokens (first block is null) block_size=16, # Standard block size use_kv_connector=True, - disable_hybrid_kv_cache_manager=True, ) # Create a request and schedule it diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 3f5e1b9eeaf73..6e739d6b0e77a 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -46,7 +46,6 @@ def create_scheduler( num_speculative_tokens: int | None = None, skip_tokenizer_init: bool = False, async_scheduling: bool = False, - disable_hybrid_kv_cache_manager: bool = False, ) -> Scheduler | AsyncScheduler: """Create scheduler under test. @@ -71,7 +70,6 @@ def create_scheduler( disable_chunked_mm_input=disable_chunked_mm_input, enable_chunked_prefill=True, async_scheduling=async_scheduling, - disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager, ) model_config = ModelConfig( model=model, diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh index a756858e2cc51..a9817313cf022 100755 --- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh @@ -136,7 +136,6 @@ run_tests_for_model() { vllm serve $model_name \ --port $PORT \ --enforce-eager \ - --disable-hybrid-kv-cache-manager \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '$KV_CONFIG'" @@ -179,7 +178,6 @@ run_tests_for_model() { --port $PORT \ --enforce-eager \ --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \ - --disable-hybrid-kv-cache-manager \ --kv-transfer-config '$KV_CONFIG'" # DP-EP attention mode diff --git a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh index a3eeedb2e5146..c48b452e24cd4 100755 --- a/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh +++ b/tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh @@ -85,7 +85,6 @@ run_tests_for_model() { --port $PREFILL_PORT \ --enforce-eager \ --gpu-memory-utilization 0.2 \ - --disable-hybrid-kv-cache-manager \ --kv-transfer-config '$KV_CONFIG'" if [ -n "$model_args" ]; then @@ -104,7 +103,6 @@ run_tests_for_model() { --port $DECODE_PORT \ --enforce-eager \ --gpu-memory-utilization 0.2 \ - --disable-hybrid-kv-cache-manager \ --kv-transfer-config '$KV_CONFIG'" if [ -n "$model_args" ]; then diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py index 6748532afd971..1c1ac915c758e 100644 --- a/tests/v1/kv_connector/unit/test_multi_connector.py +++ b/tests/v1/kv_connector/unit/test_multi_connector.py @@ -114,7 +114,6 @@ def test_multi_shared_storage_connector_consistency(): enforce_eager=True, gpu_memory_utilization=0.5, kv_transfer_config=kv_transfer_config, - disable_hybrid_kv_cache_manager=True, ) # Run generation - this should trigger saving KV cache _ = llm.generate(PROMPTS, SAMPLING_PARAMS) diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 1f3fdafc644d8..475cf2285e394 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1020,7 +1020,6 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend): "gpu_memory_utilization": 0.5, "kv_transfer_config": kv_transfer_config, "distributed_executor_backend": distributed_executor_backend, - "disable_hybrid_kv_cache_manager": True, } timeout = 6 diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py index 6040ed5a6806d..e7013a794a8c6 100644 --- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py +++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py @@ -132,7 +132,6 @@ def test_shared_storage_connector_hashes(tmp_path): enforce_eager=True, kv_transfer_config=kv_transfer_config, limit_mm_per_prompt={"image": 2}, - disable_hybrid_kv_cache_manager=True, ) # don't put this import at the top level diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index c1c0e13f77539..f0031643aa9d4 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -91,9 +91,6 @@ def create_vllm_config( max_num_batched_tokens=max_num_batched_tokens, max_model_len=max_model_len, enable_chunked_prefill=enable_chunked_prefill, - # Disable hybrid KV cache manager for testing - # Should be removed after we support hybrid KV cache manager-based testing. - disable_hybrid_kv_cache_manager=True, ) model_config = ModelConfig( model=model,