[BugFix][TritonMLA] Process weights after model loading for GGUF (#14555)

Signed-off-by: TianyuanWu <Tianyuan.Wu@amd.com>
2025-12-27 02:48:42 +08:00 · 2025-03-13 11:14:36 +08:00 · 2025-03-13 11:14:36 +08:00 · 128bf75283
commit 128bf75283
parent a94a699c3f
1 changed file with 4 additions and 1 deletion
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1330,11 +1330,14 @@ class GGUFModelLoader(BaseModelLoader):
                local_model_path, gguf_weights_map):
            model_config.hf_config.update({"tie_word_embeddings": True})

+        target_device = torch.device(device_config.device)
        with set_default_torch_dtype(model_config.dtype):
-            with torch.device(device_config.device):
+            with target_device:
                model = _initialize_model(vllm_config=vllm_config)
            model.load_weights(
                self._get_weights_iterator(local_model_path, gguf_weights_map))
+
+            _process_weights_after_loading(model, model_config, target_device)
        return model