From 128bf7528370d792099c66f301c6c5deef8f4110 Mon Sep 17 00:00:00 2001
From: TY-AMD <tianyuan.wu@amd.com>
Date: Thu, 13 Mar 2025 11:14:36 +0800
Subject: [PATCH] [BugFix][TritonMLA] Process weights after model loading for
 GGUF (#14555)

Signed-off-by: TianyuanWu <Tianyuan.Wu@amd.com>
---
 vllm/model_executor/model_loader/loader.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index bf226f6611262..c88af56e18053 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1330,11 +1330,14 @@ class GGUFModelLoader(BaseModelLoader):
                 local_model_path, gguf_weights_map):
             model_config.hf_config.update({"tie_word_embeddings": True})
 
+        target_device = torch.device(device_config.device)
         with set_default_torch_dtype(model_config.dtype):
-            with torch.device(device_config.device):
+            with target_device:
                 model = _initialize_model(vllm_config=vllm_config)
             model.load_weights(
                 self._get_weights_iterator(local_model_path, gguf_weights_map))
+
+            _process_weights_after_loading(model, model_config, target_device)
         return model