diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 5488b65c6214f..bf38c15b47013 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -307,10 +307,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): layer.w13_weight = torch.nn.Parameter( layer.w13_weight_packed.data, requires_grad=False ) + delattr(layer, "w13_weight_packed") layer.w2_weight = torch.nn.Parameter( layer.w2_weight_packed.data, requires_grad=False ) + delattr(layer, "w2_weight_packed") # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel. if self.allow_flashinfer: