From a5345bf49df74cd394a07797649f51cd67c6c697 Mon Sep 17 00:00:00 2001
From: Andrii Skliar
Date: Thu, 27 Nov 2025 20:34:59 +0100
Subject: [PATCH] [BugFix] Fix `plan` API Mismatch when using latest FlashInfer
 (#29426)

Signed-off-by: Andrii Skliar
Co-authored-by: Andrii Skliar
---
 docker/Dockerfile                        | 4 ++--
 requirements/cuda.txt                    | 2 +-
 vllm/v1/attention/backends/flashinfer.py | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index aa3aad21d6c0..eb7c105071c0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -398,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 
 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.5.2 \
-    && uv pip install --system flashinfer-jit-cache==0.5.2 \
+    uv pip install --system flashinfer-cubin==0.5.3 \
+    && uv pip install --system flashinfer-jit-cache==0.5.3 \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 15e8aadc56f4..462f18ef7159 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -10,4 +10,4 @@ torchaudio==2.9.0 # These must be updated alongside torch
 torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.5.2
+flashinfer-python==0.5.3
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index dbd72b298b1f..777398bf8a20 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -1508,7 +1508,7 @@ def fast_plan_decode(
     qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
 
     try:
-        # Make sure we pass exactly 18 arguments for tensor core version
+        # Make sure we pass exactly 19 arguments for tensor core version
         self._plan_info = self._cached_module.plan(
             self._float_workspace_buffer,
             self._int_workspace_buffer,
@@ -1528,6 +1528,7 @@ def fast_plan_decode(
             window_left,
             fixed_split_size,
             disable_split_kv,
+            0,
         )
     except Exception as e:
         raise RuntimeError(f"Error in tensor core plan: {e}") from e
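
For context, the flashinfer.py hunk above addresses a positional-argument mismatch: after the bump to FlashInfer 0.5.3, the cached tensor-core `plan` entry point expects 19 positional arguments instead of 18, and the patch simply appends a trailing `0` to keep the call in sync. The sketch below is not vLLM or FlashInfer code; it is a minimal, hypothetical illustration of that failure mode and of a more defensive try-then-fall-back calling variant (which the patch itself does not use), with `plan_v052` and `plan_v053` standing in for the cached module's `plan` before and after the version bump.

```python
# Hypothetical sketch (not vLLM/FlashInfer code): a positional-only planner
# whose arity grew by one between releases, and a caller that tolerates both.

def plan_v052(*args):
    # Stand-in for the pre-0.5.3 entry point: exactly 18 positional arguments.
    if len(args) != 18:
        raise TypeError(f"plan() takes 18 arguments ({len(args)} given)")
    return "planned with 18 args"


def plan_v053(*args):
    # Stand-in for the 0.5.3 entry point: exactly 19 positional arguments.
    if len(args) != 19:
        raise TypeError(f"plan() takes 19 arguments ({len(args)} given)")
    return "planned with 19 args"


def call_plan(plan_fn, base_args, extra_arg=0):
    """Try the newer 19-argument form first, then fall back to 18 arguments.

    The actual patch hard-codes the 19-argument call; this variant shows the
    same idea expressed defensively, padding with a neutral trailing value
    only when the newer signature is present.
    """
    try:
        return plan_fn(*base_args, extra_arg)
    except TypeError:
        return plan_fn(*base_args)


if __name__ == "__main__":
    base = tuple(range(18))            # stand-ins for the 18 original arguments
    print(call_plan(plan_v052, base))  # falls back to the 18-argument call
    print(call_plan(plan_v053, base))  # uses the new trailing argument
```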