[ModelOpt] Remove NVFP4 MoE K%16==0 constraint (#26891)

Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
2026-06-04 21:42:22 +08:00 · 2025-10-16 01:06:17 +08:00 · 2025-10-16 01:06:17 +08:00 · d796375258
commit d796375258
parent 14f8456344
1 changed file with 0 additions and 12 deletions
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -1542,23 +1542,11 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
            del layer.w2_input_scale_quant
        else:
            # Non-TRT-LLM processing (Cutlass or non-flashinfer)
            assert layer.w13_weight_scale.shape[2] % 16 == 0, (
                "Expected weight_scale.dim(1) to be divisible by 16"
            )
            assert layer.w13_weight_scale.dtype == torch.float8_e4m3fn, (
                "Weight Blockscale must be represented as FP8-E4M3"
            )
            w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
            layer.w13_weight_scale = Parameter(
                w13_blockscale_swizzled, requires_grad=False
            )
            assert layer.w2_weight_scale.shape[2] % 16 == 0, (
                "Expected weight_scale.dim(1) to be divisible by 16"
            )
            assert layer.w2_weight_scale.dtype == torch.float8_e4m3fn, (
                "Weight Blockscale must be represented as FP8-E4M3"
            )
            w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
            layer.w2_weight_scale = Parameter(
                w2_blockscale_swizzled, requires_grad=False