From b2496bb07fdf9318e7d9a8065356941fef380bac Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 10 Feb 2025 13:03:43 +0800
Subject: [PATCH] [core] fix sleep mode and pytorch checkpoint compatibility
 (#13001)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 tests/basic_correctness/test_cumem.py            | 10 ++++++++--
 vllm/model_executor/model_loader/weight_utils.py |  1 -
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index 4e9f1bf1cf86..3ac948799d77 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -115,10 +115,16 @@ def test_cumem_with_cudagraph():
 
 
 @fork_new_process_for_each_test
-def test_end_to_end():
+@pytest.mark.parametrize(
+    "model",
+    [
+        "meta-llama/Llama-3.2-1B",  # sleep mode with safetensors
+        "facebook/opt-125m"  # sleep mode with pytorch checkpoint
+    ])
+def test_end_to_end(model):
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
-    llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
+    llm = LLM(model, enable_sleep_mode=True)
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
     output = llm.generate(prompt, sampling_params)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 68ade319df28..8b2c5610f1f9 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -462,7 +462,6 @@ def pt_weights_iterator(
         state = torch.load(bin_file, map_location="cpu", weights_only=True)
         yield from state.items()
         del state
-        torch.cuda.empty_cache()
 
 
 def get_gguf_extra_tensor_names(