From 55211b01e87a02bdd0045b455715dfe508580738 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20O=C5=BC=C3=B3g?= <58388001+SzymonOzog@users.noreply.github.com>
Date: Thu, 13 Mar 2025 08:19:03 +0100
Subject: [PATCH] [Bugfix] Fix chunked prefill for GGUF (#14666)

Signed-off-by: SzymonOzog
---
 vllm/model_executor/layers/quantization/gguf.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 5d4c1c6ec8930..c92bcbea540a5 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -98,6 +98,13 @@ MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
 
 def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
                   qweight_type: int) -> torch.Tensor:
+    # HACK: when doing chunked prefill we don't generate output tokens
+    # so input to logits generator is empty which causes invalid parameter
+    if x.shape[0] == 0:
+        return torch.empty(x.shape[0],
+                           qweight.shape[0],
+                           dtype=x.dtype,
+                           device=x.device)
     # there is no need to call any kernel for fp16/bf16
     if qweight_type in UNQUANTIZED_TYPES:
         return x @ qweight.T
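
Editor's note (not part of the patch): the guard exists because, during a chunked-prefill step, no output tokens are sampled, so the logits computation can receive an activation tensor with zero rows, which the GGUF matmul kernels reject as an invalid parameter. The sketch below is a minimal illustration in plain PyTorch with made-up shapes, not the vLLM kernel path; it only checks that the early-return tensor has the same shape that x @ qweight.T would produce, so downstream code still sees a consistent (0, out_features) result.

    # Minimal sketch: assumed shapes, plain PyTorch on CPU, illustrative only.
    import torch

    hidden_size, vocab_size = 64, 128   # arbitrary small sizes for the example

    # Zero prompt tokens reach the logits matmul during a chunked-prefill step.
    x = torch.empty(0, hidden_size)                 # activations with 0 rows
    qweight = torch.empty(vocab_size, hidden_size)  # (out_features, in_features) layout

    reference = x @ qweight.T                       # shape (0, vocab_size)
    fallback = torch.empty(x.shape[0], qweight.shape[0],
                           dtype=x.dtype, device=x.device)

    assert fallback.shape == reference.shape == (0, vocab_size)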