mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-05 11:29:09 +08:00
[bugfix] remove unused parameters to reduce unnecessary vram usage (#26789)
Signed-off-by: Reinforce-II <fate@eastal.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
parent
1c160841ea
commit
980de31ca0
@@ -307,10 +307,12 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
|||||||
layer.w13_weight = torch.nn.Parameter(
|
layer.w13_weight = torch.nn.Parameter(
|
||||||
layer.w13_weight_packed.data, requires_grad=False
|
layer.w13_weight_packed.data, requires_grad=False
|
||||||
)
|
)
|
||||||
|
delattr(layer, "w13_weight_packed")
|
||||||
|
|
||||||
layer.w2_weight = torch.nn.Parameter(
|
layer.w2_weight = torch.nn.Parameter(
|
||||||
layer.w2_weight_packed.data, requires_grad=False
|
layer.w2_weight_packed.data, requires_grad=False
|
||||||
)
|
)
|
||||||
|
delattr(layer, "w2_weight_packed")
|
||||||
|
|
||||||
# reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
|
# reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel.
|
||||||
if self.allow_flashinfer:
|
if self.allow_flashinfer:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user