mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-01 21:11:19 +08:00
[Perf] Reuse workspace for FP8+FP4 Marlin MoE (#20500)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
parent
3053a22b33
commit
dbebb7f812
@@ -398,7 +398,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
|
|||||||
quant_type_id=scalar_types.float4_e2m1f.id,
|
quant_type_id=scalar_types.float4_e2m1f.id,
|
||||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||||
global_num_experts=global_num_experts,
|
global_num_experts=global_num_experts,
|
||||||
expert_map=expert_map)
|
expert_map=expert_map,
|
||||||
|
workspace=layer.workspace)
|
||||||
|
|
||||||
# FlashInfer fused experts path
|
# FlashInfer fused experts path
|
||||||
if self.fused_experts is not None:
|
if self.fused_experts is not None:
|
||||||
@@ -940,7 +941,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
|
|||||||
quant_type_id=scalar_types.float8_e4m3fn.id,
|
quant_type_id=scalar_types.float8_e4m3fn.id,
|
||||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||||
global_num_experts=global_num_experts,
|
global_num_experts=global_num_experts,
|
||||||
expert_map=expert_map)
|
expert_map=expert_map,
|
||||||
|
workspace=layer.workspace)
|
||||||
|
|
||||||
assert self.fused_experts_func is not None
|
assert self.fused_experts_func is not None
|
||||||
|
|
||||||
|
|||||||
@@ -1103,7 +1103,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
|||||||
quant_type_id=scalar_types.float8_e4m3fn.id,
|
quant_type_id=scalar_types.float8_e4m3fn.id,
|
||||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||||
global_num_experts=global_num_experts,
|
global_num_experts=global_num_experts,
|
||||||
expert_map=expert_map)
|
expert_map=expert_map,
|
||||||
|
workspace=layer.workspace)
|
||||||
elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
|
elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
|
||||||
assert self.block_quant is None
|
assert self.block_quant is None
|
||||||
assert (not renormalize and custom_routing_function is not None)
|
assert (not renormalize and custom_routing_function is not None)
|
||||||
|
|||||||
@@ -1474,7 +1474,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
|||||||
quant_type_id=scalar_types.float4_e2m1f.id,
|
quant_type_id=scalar_types.float4_e2m1f.id,
|
||||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||||
global_num_experts=global_num_experts,
|
global_num_experts=global_num_experts,
|
||||||
expert_map=expert_map)
|
expert_map=expert_map,
|
||||||
|
workspace=layer.workspace)
|
||||||
|
|
||||||
if self.fused_experts is not None:
|
if self.fused_experts is not None:
|
||||||
assert self.allow_flashinfer and \
|
assert self.allow_flashinfer and \
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user