From edb5286ea57e8807172c4ce2df3864d41dae61e2 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Fri, 2 May 2025 04:07:03 -0400 Subject: [PATCH] [BugFix] Fix Memory Leak (#17567) Signed-off-by: rshaw@neuralmagic.com --- tests/v1/core/test_scheduler.py | 77 +++++++++++++++++++++++++++++++++ vllm/v1/core/sched/scheduler.py | 5 ++- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index ee4e95856f233..9987688b02fad 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1165,3 +1165,80 @@ def test_kv_connector_handles_preemption(): # All memory should be freed since nothing is running. assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ == NUM_BLOCKS - 1 + + +def make_output(scheduler: Scheduler): + return ModelRunnerOutput( + req_ids=[req.request_id for req in scheduler.running], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(scheduler.running) + }, + sampled_token_ids=[[1000]] * len(scheduler.running), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + + +def assert_scheduler_empty(scheduler: Scheduler): + """Confirm the scheduler is "empty" - i.e. no leaks.""" + # Scheduler Metadata. + assert len(scheduler.requests) == 0 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 0 + assert len(scheduler.finished_req_ids) == 0 + assert len(scheduler._cached_reqs_data) == 0 + + # EncoderCacheManager. + assert len(scheduler.encoder_cache_manager.freed) == 0 + assert len(scheduler.encoder_cache_manager.cached) == 0 + + # KVCache Manager. + assert len(scheduler.kv_cache_manager.req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 + assert len(scheduler.kv_cache_manager.num_cached_block) == 0 + num_free_blocks = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + assert num_free_blocks == ( + scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1) + + # NOTE(rob): just the ref count on blocks will be 0. The hash + # value, etc will remain since we lazily evict for prefix cache. + for block in scheduler.kv_cache_manager.block_pool.blocks: + assert block.ref_cnt == 0 + # assert block._block_hash is None + # assert ( + # len(scheduler.kv_cache_manager.block_pool.cached_block_hash_to_block + # ) == 0) + + +def test_memory_leak(): + """Test that we do not have a memory leak.""" + + scheduler = create_scheduler(enable_prefix_caching=True) + + NUM_REQUESTS = 5 + NUM_TOKENS = 10 + MAX_TOKENS = 10 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + + # Add each request. + for request in requests: + scheduler.add_request(request) + scheduler_output = scheduler.schedule() + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Iterate until done. + while True: + scheduler_output = scheduler.schedule() + if len(scheduler.running) == 0: + break + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Confirm no memory leak. + assert_scheduler_empty(scheduler) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 21711c9292f9f..60c6a0f00600e 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -739,7 +739,10 @@ class Scheduler(SchedulerInterface): # Return the cached request data to the queue so they can be reused. for req_data in scheduler_output.scheduled_cached_reqs: - self._cached_reqs_data[req_data.req_id].append(req_data) + # NOTE(rob): since we free stopped reqs above, adding stopped reqs + # to _cached_reqs_data will cause a memory leak. + if req_data.req_id not in self.finished_req_ids: + self._cached_reqs_data[req_data.req_id].append(req_data) self.running = new_running engine_core_outputs = EngineCoreOutputs(