From b2496bb07fdf9318e7d9a8065356941fef380bac Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 10 Feb 2025 13:03:43 +0800 Subject: [PATCH] [core] fix sleep mode and pytorch checkpoint compatibility (#13001) Signed-off-by: youkaichao --- tests/basic_correctness/test_cumem.py | 10 ++++++++-- vllm/model_executor/model_loader/weight_utils.py | 1 - 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 4e9f1bf1cf86..3ac948799d77 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -115,10 +115,16 @@ def test_cumem_with_cudagraph(): @fork_new_process_for_each_test -def test_end_to_end(): +@pytest.mark.parametrize( + "model", + [ + "meta-llama/Llama-3.2-1B", # sleep mode with safetensors + "facebook/opt-125m" # sleep mode with pytorch checkpoint + ]) +def test_end_to_end(model): free, total = torch.cuda.mem_get_info() used_bytes_baseline = total - free # in case other process is running - llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True) + llm = LLM(model, enable_sleep_mode=True) prompt = "How are you?" sampling_params = SamplingParams(temperature=0, max_tokens=10) output = llm.generate(prompt, sampling_params) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 68ade319df28..8b2c5610f1f9 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -462,7 +462,6 @@ def pt_weights_iterator( state = torch.load(bin_file, map_location="cpu", weights_only=True) yield from state.items() del state - torch.cuda.empty_cache() def get_gguf_extra_tensor_names(