diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index c2b884c058d3a..5470deb768450 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -398,7 +398,8 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
                 quant_type_id=scalar_types.float4_e2m1f.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
-                expert_map=expert_map)
+                expert_map=expert_map,
+                workspace=layer.workspace)
 
         # FlashInfer fused experts path
         if self.fused_experts is not None:
@@ -940,7 +941,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
                 quant_type_id=scalar_types.float8_e4m3fn.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
-                expert_map=expert_map)
+                expert_map=expert_map,
+                workspace=layer.workspace)
 
         assert self.fused_experts_func is not None
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 49ff87df93c31..254cc2be05ee6 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1103,7 +1103,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 quant_type_id=scalar_types.float8_e4m3fn.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
-                expert_map=expert_map)
+                expert_map=expert_map,
+                workspace=layer.workspace)
         elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
             assert self.block_quant is None
             assert (not renormalize and custom_routing_function is not None)
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 9b99931e7b43f..60a79e53e8141 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1474,7 +1474,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
                 quant_type_id=scalar_types.float4_e2m1f.id,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 global_num_experts=global_num_experts,
-                expert_map=expert_map)
+                expert_map=expert_map,
+                workspace=layer.workspace)
 
         if self.fused_experts is not None:
             assert self.allow_flashinfer and \