From 980de31ca0e16018ddb871c6c1df31a0fd7db223 Mon Sep 17 00:00:00 2001
From: Reinforce-II
Date: Wed, 22 Oct 2025 23:16:09 +0800
Subject: [PATCH] [bugfix] remove unused parameters to reduce unnecessary vram usage (#26789)

Signed-off-by: Reinforce-II
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
---
 .../quantization/compressed_tensors/compressed_tensors_moe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 5488b65c6214f..bf38c15b47013 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -307,10 +307,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight_packed.data, requires_grad=False
         )
+        delattr(layer, "w13_weight_packed")
         layer.w2_weight = torch.nn.Parameter(
             layer.w2_weight_packed.data, requires_grad=False
         )
+        delattr(layer, "w2_weight_packed")

         # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
         if self.allow_flashinfer:
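
Why the delattr calls matter: wrapping `w13_weight_packed.data` in a new Parameter shares storage, so no memory is duplicated at that point, but leaving the old attribute registered keeps a second live reference on the module. If `w13_weight` is later replaced (e.g. by a kernel-specific re-layout), the stale `w13_weight_packed` Parameter pins the original storage and wastes VRAM. The following minimal sketch illustrates this on a bare `torch.nn.Module`; the module setup and `demo_packed_param_cleanup` function are hypothetical, not vLLM code.

```python
import torch


def demo_packed_param_cleanup() -> None:
    """Sketch of the cleanup pattern from the patch, under assumed names."""
    layer = torch.nn.Module()
    layer.w13_weight_packed = torch.nn.Parameter(
        torch.randn(8, 8), requires_grad=False
    )

    # Re-register under the canonical name; this shares the same storage,
    # as confirmed by the matching data pointers below.
    layer.w13_weight = torch.nn.Parameter(
        layer.w13_weight_packed.data, requires_grad=False
    )
    assert layer.w13_weight.data_ptr() == layer.w13_weight_packed.data_ptr()

    # Drop the stale registration. Without this, named_parameters() reports
    # both tensors, and the packed attribute would keep the old storage
    # alive if w13_weight is later swapped for a re-laid-out tensor.
    delattr(layer, "w13_weight_packed")

    assert [name for name, _ in layer.named_parameters()] == ["w13_weight"]


if __name__ == "__main__":
    demo_packed_param_cleanup()
```

`torch.nn.Module.__delattr__` removes the entry from the module's parameter registry, so after the call the only reference to the weight storage is the newly registered Parameter; this is the same effect the two `delattr` lines in the patch achieve for `w13_weight_packed` and `w2_weight_packed`.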