[bugfix] remove unused parameters to reduce unnecessary vram usage (#26789)
Signed-off-by: Reinforce-II <fate@eastal.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
parent 1c160841ea
commit 980de31ca0
@@ -307,10 +307,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
         layer.w13_weight = torch.nn.Parameter(
             layer.w13_weight_packed.data, requires_grad=False
         )
+        delattr(layer, "w13_weight_packed")
 
         layer.w2_weight = torch.nn.Parameter(
             layer.w2_weight_packed.data, requires_grad=False
         )
+        delattr(layer, "w2_weight_packed")
 
         # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
         if self.allow_flashinfer:
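
Below is a minimal sketch, not vLLM code, of the pattern this change relies on: rebinding the packed weight's storage under the canonical parameter name and deleting the packed attribute, so that no stale reference keeps the original tensor alive in VRAM once a later kernel-specific step (e.g. the FlashInfer reorder) replaces the weight. PackedWeightLayer, w_packed, and the clone() step are hypothetical stand-ins.

import torch


class PackedWeightLayer(torch.nn.Module):
    """Toy stand-in for a quantized MoE method that loads weights under a packed name."""

    def __init__(self, out_features: int, in_features: int) -> None:
        super().__init__()
        # The weight loader registers the raw tensor under a "*_packed" name.
        self.w_packed = torch.nn.Parameter(
            torch.empty(out_features, in_features), requires_grad=False
        )

    def process_weights_after_loading(self) -> None:
        # Rebind the same storage under the name the kernel expects (no copy is made)...
        self.w = torch.nn.Parameter(self.w_packed.data, requires_grad=False)
        # ...and drop the packed attribute so it no longer pins the tensor.
        delattr(self, "w_packed")
        # When a later step swaps in a processed copy (a kernel-specific
        # reorder in the real code; clone() here as a placeholder), the
        # original storage can be freed, since only `self.w` referenced it.
        self.w = torch.nn.Parameter(self.w.data.clone(), requires_grad=False)


layer = PackedWeightLayer(4, 8)
layer.process_weights_after_loading()
assert hasattr(layer, "w") and not hasattr(layer, "w_packed")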