From a8b150c5950a353d458b47ce319585698ef41e3f Mon Sep 17 00:00:00 2001
From: ljss <31004720+beginlner@users.noreply.github.com>
Date: Tue, 28 Nov 2023 03:18:26 +0800
Subject: [PATCH] Init model on GPU to reduce CPU memory footprint (#1796)

---
 vllm/model_executor/model_loader.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py
index 54b87c4b866e..fe6ff36edb88 100644
--- a/vllm/model_executor/model_loader.py
+++ b/vllm/model_executor/model_loader.py
@@ -87,9 +87,9 @@ def get_model(model_config: ModelConfig) -> nn.Module:
     with _set_default_torch_dtype(model_config.dtype):
         # Create a model instance.
         # The weights will be initialized as empty tensors.
-        model = model_class(model_config.hf_config, linear_method)
+        with torch.device("cuda"):
+            model = model_class(model_config.hf_config, linear_method)
         if model_config.load_format == "dummy":
-            model = model.cuda()
             # NOTE(woosuk): For accurate performance evaluation, we assign
             # random values to the weights.
             initialize_dummy_weights(model)
@@ -97,5 +97,4 @@ def get_model(model_config: ModelConfig) -> nn.Module:
             # Load the weights from the cached or downloaded files.
             model.load_weights(model_config.model, model_config.download_dir,
                                model_config.load_format, model_config.revision)
-            model = model.cuda()
     return model.eval()
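
The patch relies on PyTorch's `torch.device("cuda")` context manager (available since PyTorch 2.0): any module constructed inside the context allocates its parameters directly on the GPU, so the weights are never first materialized in CPU memory and the trailing `model.cuda()` copies become unnecessary. The snippet below is a minimal, self-contained sketch of that technique outside vLLM; `ToyModel` is a hypothetical stand-in for the `model_class` that `get_model` instantiates, not code from this patch.

    import torch
    import torch.nn as nn

    # Hypothetical stand-in for vLLM's model_class, used only to
    # illustrate device-scoped construction.
    class ToyModel(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.proj = nn.Linear(4096, 4096)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.proj(x)

    if torch.cuda.is_available():
        # Parameters are allocated on the GPU as the module is built,
        # so no CPU copy of the weights is ever created.
        with torch.device("cuda"):
            model = ToyModel()
        print(next(model.parameters()).device)  # -> cuda:0

Compared with building the model on the CPU and then calling `.cuda()`, this avoids holding a second full copy of the weights in host memory during loading, which is the CPU-memory saving the subject line refers to.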