From 6b04039a7240ae1039fea4bd179ec3b452f19107 Mon Sep 17 00:00:00 2001
From: sstamenk
Date: Fri, 15 Aug 2025 19:17:31 +0200
Subject: [PATCH] [BugFix] Skip the Q component for QKVParallelLinear in the
 case of QKVCrossParallelLinear since its width is 0 (#22369)

Signed-off-by: sstamenk
---
 vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index ddb50968904d1..659029fd37f70 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -121,6 +121,9 @@ def requantize_with_max_scale(
     if unfused_module_in_checkpoint:
         start = 0
         for idx, logical_width in enumerate(logical_widths):
+            # Skip any component with zero width.
+            if logical_width == 0:
+                continue
             end = start + logical_width
             weight_dq = per_tensor_dequantize(weight[start:end, :],
                                               weight_scale[idx])
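
For context, below is a minimal, self-contained sketch of the requantization
loop this patch touches. It is not the vLLM implementation:
per_tensor_dequantize and ops.scaled_fp8_quant are replaced with simple int8
stand-ins, and the logical widths and scale values are hypothetical. With
logical_widths = [0, 4, 4] (modeling the zero-width Q slot that
QKVCrossParallelLinear leaves in its KV-only QKVParallelLinear), the added
guard skips the empty component instead of running the dequantize/requantize
ops on an empty weight slice.

import torch

def per_tensor_dequantize(q_weight: torch.Tensor,
                          scale: torch.Tensor) -> torch.Tensor:
    # Stand-in for vLLM's per_tensor_dequantize helper.
    return q_weight.to(torch.float32) * scale

def per_tensor_quantize(weight: torch.Tensor,
                        scale: torch.Tensor) -> torch.Tensor:
    # Stand-in for ops.scaled_fp8_quant; int8 keeps the sketch portable.
    return torch.clamp(torch.round(weight / scale), -127, 127).to(torch.int8)

def requantize_with_max_scale_sketch(weight, weight_scale, logical_widths):
    # Fuse the per-component scales into a single shared max scale.
    max_w_scale = weight_scale.max()
    start = 0
    for idx, logical_width in enumerate(logical_widths):
        # The fix: a zero-width component (e.g. the Q slot of the KV-only
        # projection in QKVCrossParallelLinear) owns no rows of the fused
        # weight, so there is nothing to requantize for it.
        if logical_width == 0:
            continue
        end = start + logical_width
        weight_dq = per_tensor_dequantize(weight[start:end, :],
                                          weight_scale[idx])
        weight[start:end, :] = per_tensor_quantize(weight_dq, max_w_scale)
        start = end
    return max_w_scale, weight

# Hypothetical fused QKV weight: Q has width 0, K and V have width 4 each.
logical_widths = [0, 4, 4]
weight = torch.randint(-127, 128, (8, 16), dtype=torch.int8)
weight_scale = torch.tensor([0.0, 0.02, 0.03])  # Q scale is a placeholder
max_scale, requantized = requantize_with_max_scale_sketch(
    weight, weight_scale, logical_widths)
print(max_scale.item(), requantized.shape)

Note that `start` does not need to advance for a skipped component: its
width is zero, so `end` would equal `start` anyway, and the K and V slices
still land on the correct rows of the fused weight.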