[BugFix] Fix plan API Mismatch when using latest FlashInfer (#29426)

Signed-off-by: Andrii Skliar <askliar@askliar-mlt.client.nvidia.com>
Co-authored-by: Andrii Skliar <askliar@askliar-mlt.client.nvidia.com>
2025-12-12 02:45:02 +08:00 · 2025-11-27 20:34:59 +01:00 · 2025-11-27 20:34:59 +01:00 · a5345bf49d
commit a5345bf49d
parent e5a621b724
3 changed files with 5 additions and 4 deletions
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -398,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.5.2 \
+    uv pip install --system flashinfer-cubin==0.5.3 \
-    && uv pip install --system flashinfer-jit-cache==0.5.2 \
+    && uv pip install --system flashinfer-jit-cache==0.5.3 \
        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
    && flashinfer show-config
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -10,4 +10,4 @@ torchaudio==2.9.0
 # These must be updated alongside torch
 torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.5.2
+flashinfer-python==0.5.3
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -1508,7 +1508,7 @@ def fast_plan_decode(
    qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
    try:
-        # Make sure we pass exactly 18 arguments for tensor core version
+        # Make sure we pass exactly 19 arguments for tensor core version
        self._plan_info = self._cached_module.plan(
            self._float_workspace_buffer,
            self._int_workspace_buffer,
@@ -1528,6 +1528,7 @@ def fast_plan_decode(
            window_left,
            fixed_split_size,
            disable_split_kv,
+            0,
        )
    except Exception as e:
        raise RuntimeError(f"Error in tensor core plan: {e}") from e