Fix ShardedStateLoader for vllm fp8 quantization (#7708)

Flex Wang 2024-08-22 05:25:04 -07:00 committed by GitHub
parent a3fce56b88
commit 4f419c00a6


@@ -579,6 +579,10 @@ class ShardedStateLoader(BaseModelLoader):
             with torch.device(device_config.device):
                 model = _initialize_model(model_config, self.load_config,
                                           lora_config, cache_config)
+                for _, module in model.named_modules():
+                    quant_method = getattr(module, "quant_method", None)
+                    if quant_method is not None:
+                        quant_method.process_weights_after_loading(module)
             rank = get_tensor_model_parallel_rank()
             pattern = os.path.join(
                 local_model_path,
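The added loop runs right after the model is initialized and before the saved shards are read, so each module's quant_method can post-process its parameters (as the default loader does) before the already-processed fp8 tensors from the sharded checkpoint are copied in. A minimal sketch of the save/load round trip this targets is below; the model name and output path are placeholders, and it assumes the save_sharded_state helper on the model executor and the "sharded_state" load format:

    # Sketch only: names and paths are illustrative, not part of this commit.
    from vllm import LLM, SamplingParams

    # 1) Load an fp8-quantized model once and save its sharded state.
    llm = LLM(model="meta-llama/Meta-Llama-3-8B", quantization="fp8")
    llm.llm_engine.model_executor.save_sharded_state(
        path="/tmp/llama3-8b-fp8-sharded")

    # 2) Reload directly from the sharded checkpoint. With this fix,
    #    ShardedStateLoader calls quant_method.process_weights_after_loading()
    #    on every module, so the fp8 weights load into correctly prepared
    #    parameters instead of failing or producing garbage.
    llm = LLM(model="/tmp/llama3-8b-fp8-sharded",
              load_format="sharded_state",
              quantization="fp8")
    print(llm.generate(["Hello"], SamplingParams(max_tokens=8)))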