[bugfix] remove unused parameters to reduce unnecessary vram usage (#26789)

Signed-off-by: Reinforce-II <fate@eastal.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Reinforce-II, 2025-10-22 23:16:09 +08:00 (committed by GitHub)
parent 1c160841ea
commit 980de31ca0

@@ -307,10 +307,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight_packed.data, requires_grad=False
         )
+        delattr(layer, "w13_weight_packed")
         layer.w2_weight = torch.nn.Parameter(
             layer.w2_weight_packed.data, requires_grad=False
         )
+        delattr(layer, "w2_weight_packed")
         # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
         if self.allow_flashinfer:
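
For context, below is a minimal, hypothetical Python sketch of the pattern this diff applies: once weight post-processing has produced the tensor the kernels actually read, the stale packed attribute is deleted so the module no longer holds a reference that pins GPU memory. The module name, method, and transpose-based "repacking" are illustrative stand-ins, not the vLLM CompressedTensorsW4A4MoeMethod code.

import torch


class PackedLinear(torch.nn.Module):
    """Toy module standing in for a quantized layer (hypothetical)."""

    def __init__(self, out_features: int, in_features: int, device: str):
        super().__init__()
        # Weight as loaded from the checkpoint (the "packed" layout).
        self.weight_packed = torch.nn.Parameter(
            torch.randn(out_features, in_features, device=device),
            requires_grad=False,
        )

    def process_weights_after_loading(self) -> None:
        # Build the layout the kernel expects; a transpose-copy stands in for
        # real repacking, so a second tensor now exists on the device.
        repacked = self.weight_packed.t().contiguous()
        self.weight = torch.nn.Parameter(repacked, requires_grad=False)
        # Drop the stale attribute. Without this, the module keeps a reference
        # to the original packed tensor and its memory can never be reclaimed.
        delattr(self, "weight_packed")


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    layer = PackedLinear(4096, 4096, device)
    layer.process_weights_after_loading()
    if device == "cuda":
        # Return the now-unreferenced block to the caching allocator's pool.
        torch.cuda.empty_cache()
        print(f"allocated: {torch.cuda.memory_allocated() / 1e6:.1f} MB")

Note that in the diff itself the new Parameter initially wraps the packed tensor's `.data`, so the delattr alone frees little; the benefit is that no stale attribute is left behind to keep the old copy alive once later steps (such as the FlashInfer reorder mentioned in the trailing context) rebuild the weights.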