From 128bf7528370d792099c66f301c6c5deef8f4110 Mon Sep 17 00:00:00 2001
From: TY-AMD
Date: Thu, 13 Mar 2025 11:14:36 +0800
Subject: [PATCH] [BugFix][TritonMLA] Process weights after model loading for GGUF (#14555)

Signed-off-by: TianyuanWu
---
 vllm/model_executor/model_loader/loader.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index bf226f6611262..c88af56e18053 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1330,11 +1330,14 @@ class GGUFModelLoader(BaseModelLoader):
                 local_model_path, gguf_weights_map):
             model_config.hf_config.update({"tie_word_embeddings": True})
 
+        target_device = torch.device(device_config.device)
         with set_default_torch_dtype(model_config.dtype):
-            with torch.device(device_config.device):
+            with target_device:
                 model = _initialize_model(vllm_config=vllm_config)
             model.load_weights(
                 self._get_weights_iterator(local_model_path, gguf_weights_map))
+
+            _process_weights_after_loading(model, model_config, target_device)
         return model
 
 