[ROCm][MXFP4] Infer w4a4 quant method in rocm aiter fused moe (#29775)

Signed-off-by: ZhiweiYan-96 <zhiwei.yan@amd.com>
2025-12-20 02:25:24 +08:00 · 2025-12-05 19:01:16 +08:00 · 2025-12-05 19:01:16 +08:00 · 3628bcaaf2
commit 3628bcaaf2
parent b73b158ab0
2 changed files with 6 additions and 2 deletions
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -345,6 +345,10 @@ class FusedMoEQuantConfig:
    def use_mxfp4_w4a16(self) -> bool:
        return self._a1.dtype is None and self._w1.dtype == "mxfp4"
    @property
    def use_mxfp4_w4a4(self) -> bool:
        return self._a1.dtype == "mxfp4" and self._w1.dtype == "mxfp4"
    @property
    def use_nvfp4_w4a4(self) -> bool:
        return self.quant_dtype == "nvfp4"
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -221,8 +221,8 @@ def rocm_aiter_fused_experts(
    else:
        quant_method = QuantMethod.NO.value
-        # quark moe for mxfp4 w_dtype
+        # quark moe for mxfp4 w_dtype mxfp4 a_dtype
-        if quant_config.use_mxfp4_w4a16:
+        if quant_config.use_mxfp4_w4a4:
            quant_method = QuantMethod.BLOCK_1X32.value
        # w8a8 block-scaled
        if quant_config.block_shape is not None and quant_config.use_fp8_w8a8: