diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 39607dc4ca11e..c966be99ed24f 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -660,36 +660,17 @@ def get_default_config( is_marlin: bool, block_shape: Optional[List[int]] = None, ) -> Dict[str, int]: - if dtype == "fp8_w8a8": - if block_shape is None: - config = { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 8, - "num_stages": 4, - } - if M <= E: - config = { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 4, - } - else: - # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0] - # BLOCK_SIZE_K must be divisible by block_shape[1] - config = { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": block_shape[0], - "BLOCK_SIZE_K": block_shape[1], - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3, - } + if dtype == "fp8_w8a8" and block_shape is not None: + # Block-wise quant: BLOCK_SIZE_N must be divisible by block_shape[0] + # BLOCK_SIZE_K must be divisible by block_shape[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_shape[0], + "BLOCK_SIZE_K": block_shape[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3, + } else: config = { "BLOCK_SIZE_M": 64,