[Bugfix] Fix mismatched nvfp4 gemm output shape (#29742)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-01-25 18:24:33 +08:00 · 2025-11-30 17:15:01 +08:00 · 2025-11-30 17:15:01 +08:00 · 47539cfd3e
commit 47539cfd3e
parent 2afcec4dec
1 changed files with 1 additions and 1 deletions
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -184,7 +184,7 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
            return out

        output_dtype = x.dtype
-        output_shape = [x.shape[0], layer.weight_packed.shape[0]]
+        output_shape = [*x.shape[:-1], layer.weight_packed.shape[0]]

        # quantize BF16 or FP16 to (FP4 and interleaved block scale)
        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale)