diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 34f9389c82a9b..f3ad680b72b55 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -177,3 +177,34 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
 
     # cmp output
     assert output[0].outputs[0].text == output3[0].outputs[0].text
+
+
+@create_new_process_for_each_test()
+def test_deep_sleep():
+    model = "Qwen/Qwen3-0.6B"
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM(model, enable_sleep_mode=True)
+    prompt = "How are you?"
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)
+
+    # Put the engine to deep sleep
+    llm.sleep(level=2)
+
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    assert used_bytes < 3 * GiB_bytes
+
+    llm.wake_up(tags=["weights"])
+    llm.collective_rpc("reload_weights")
+    free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+    assert used_bytes < 4 * GiB_bytes
+
+    # now allocate kv cache and cuda graph memory
+    llm.wake_up(tags=["kv_cache"])
+    output2 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index f83a4f4faeb5e..1688b8b83e873 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -216,8 +216,7 @@ class Worker(WorkerBase):
         self.model_runner.update_config(overrides)
 
     def reload_weights(self) -> None:
-        with self._maybe_get_memory_pool_context(tag="weights"):
-            self.model_runner.reload_weights()
+        self.model_runner.reload_weights()
 
     @torch.inference_mode()
     def determine_available_memory(self) -> int: