From a5345bf49df74cd394a07797649f51cd67c6c697 Mon Sep 17 00:00:00 2001
From: Andrii Skliar
Date: Thu, 27 Nov 2025 20:34:59 +0100
Subject: [PATCH] [BugFix] Fix `plan` API Mismatch when using latest FlashInfer
 (#29426)

Signed-off-by: Andrii Skliar
Co-authored-by: Andrii Skliar
---
 docker/Dockerfile                        | 4 ++--
 requirements/cuda.txt                    | 2 +-
 vllm/v1/attention/backends/flashinfer.py | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index aa3aad21d6c0..eb7c105071c0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -398,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.5.2 \
-    && uv pip install --system flashinfer-jit-cache==0.5.2 \
+    uv pip install --system flashinfer-cubin==0.5.3 \
+    && uv pip install --system flashinfer-jit-cache==0.5.3 \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 15e8aadc56f4..462f18ef7159 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -10,4 +10,4 @@ torchaudio==2.9.0 # These must be updated alongside torch
 torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.5.2
+flashinfer-python==0.5.3
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index dbd72b298b1f..777398bf8a20 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -1508,7 +1508,7 @@ def fast_plan_decode(
     qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
 
     try:
-        # Make sure we pass exactly 18 arguments for tensor core version
+        # Make sure we pass exactly 19 arguments for tensor core version
         self._plan_info = self._cached_module.plan(
             self._float_workspace_buffer,
             self._int_workspace_buffer,
@@ -1528,6 +1528,7 @@ def fast_plan_decode(
             window_left,
             fixed_split_size,
             disable_split_kv,
+            0,
         )
     except Exception as e:
         raise RuntimeError(f"Error in tensor core plan: {e}") from e
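
For context, the flashinfer.py hunk above addresses a positional-argument mismatch: after the bump to FlashInfer 0.5.3, the cached tensor-core `plan` entry point expects 19 positional arguments instead of 18, and the patch simply appends a trailing `0` to keep the call in sync. The sketch below is not vLLM or FlashInfer code; it is a minimal, hypothetical illustration of that failure mode and of a more defensive try-then-fall-back calling variant (which the patch itself does not use), with `plan_v052` and `plan_v053` standing in for the cached module's `plan` before and after the version bump.

```python
# Hypothetical sketch (not vLLM/FlashInfer code): a positional-only planner
# whose arity grew by one between releases, and a caller that tolerates both.

def plan_v052(*args):
    # Stand-in for the pre-0.5.3 entry point: exactly 18 positional arguments.
    if len(args) != 18:
        raise TypeError(f"plan() takes 18 arguments ({len(args)} given)")
    return "planned with 18 args"


def plan_v053(*args):
    # Stand-in for the 0.5.3 entry point: exactly 19 positional arguments.
    if len(args) != 19:
        raise TypeError(f"plan() takes 19 arguments ({len(args)} given)")
    return "planned with 19 args"


def call_plan(plan_fn, base_args, extra_arg=0):
    """Try the newer 19-argument form first, then fall back to 18 arguments.

    The actual patch hard-codes the 19-argument call; this variant shows the
    same idea expressed defensively, padding with a neutral trailing value
    only when the newer signature is present.
    """
    try:
        return plan_fn(*base_args, extra_arg)
    except TypeError:
        return plan_fn(*base_args)


if __name__ == "__main__":
    base = tuple(range(18))            # stand-ins for the 18 original arguments
    print(call_plan(plan_v052, base))  # falls back to the 18-argument call
    print(call_plan(plan_v053, base))  # uses the new trailing argument
```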