From 47539cfd3e5006159e427ee5bc32823f6cef7ec3 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Sun, 30 Nov 2025 17:15:01 +0800
Subject: [PATCH] [Bugfix] Fix mismatched nvfp4 gemm output shape (#29742)

Signed-off-by: Isotr0py
---
 .../compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
index b603bdb13280b..c0b1e3ceeba34 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py
@@ -184,7 +184,7 @@ class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
             return out
 
         output_dtype = x.dtype
-        output_shape = [x.shape[0], layer.weight_packed.shape[0]]
+        output_shape = [*x.shape[:-1], layer.weight_packed.shape[0]]
 
         # quantize BF16 or FP16 to (FP4 and interleaved block scale)
         x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale)
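
Why the one-line change matters: the old computation kept only x.shape[0],
which mismatches whenever the activation carries more than two dimensions
(e.g. [batch, seq_len, hidden]); the fix preserves every leading dimension
and replaces only the contracted hidden dimension. A minimal sketch, with
hypothetical shapes (num_output_features stands in for
layer.weight_packed.shape[0]):

import torch

# Hypothetical 3-D activation [batch, seq_len, hidden], as seen when a
# batched sequence has not been flattened to 2-D before the gemm.
x = torch.empty(2, 16, 4096)
num_output_features = 11008  # stands in for layer.weight_packed.shape[0]

# Old computation: keeps only the first dim, silently dropping seq_len,
# so reshaping the gemm result to this shape would not match the actual
# element count for 3-D inputs.
old_shape = [x.shape[0], num_output_features]     # [2, 11008]

# Fixed computation: preserves all leading dims and swaps only the
# contracted hidden dim for the output feature count.
new_shape = [*x.shape[:-1], num_output_features]  # [2, 16, 11008]

assert new_shape == [2, 16, 11008]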