mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-12 02:45:02 +08:00
[BugFix] Fix plan API Mismatch when using latest FlashInfer (#29426)
Signed-off-by: Andrii Skliar <askliar@askliar-mlt.client.nvidia.com>
Co-authored-by: Andrii Skliar <askliar@askliar-mlt.client.nvidia.com>
This commit is contained in:
parent
e5a621b724
commit
a5345bf49d
@ -398,8 +398,8 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
|||||||
# Install FlashInfer pre-compiled kernel cache and binaries
|
# Install FlashInfer pre-compiled kernel cache and binaries
|
||||||
# https://docs.flashinfer.ai/installation.html
|
# https://docs.flashinfer.ai/installation.html
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system flashinfer-cubin==0.5.2 \
|
uv pip install --system flashinfer-cubin==0.5.3 \
|
||||||
&& uv pip install --system flashinfer-jit-cache==0.5.2 \
|
&& uv pip install --system flashinfer-jit-cache==0.5.3 \
|
||||||
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
|
||||||
&& flashinfer show-config
|
&& flashinfer show-config
|
||||||
|
|
||||||
|
|||||||
@ -10,4 +10,4 @@ torchaudio==2.9.0
|
|||||||
# These must be updated alongside torch
|
# These must be updated alongside torch
|
||||||
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||||
# FlashInfer should be updated together with the Dockerfile
|
# FlashInfer should be updated together with the Dockerfile
|
||||||
flashinfer-python==0.5.2
|
flashinfer-python==0.5.3
|
||||||
|
|||||||
@ -1508,7 +1508,7 @@ def fast_plan_decode(
|
|||||||
qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
|
qo_indptr_host = _get_range_buf(batch_size + 1, "cpu")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Make sure we pass exactly 18 arguments for tensor core version
|
# Make sure we pass exactly 19 arguments for tensor core version
|
||||||
self._plan_info = self._cached_module.plan(
|
self._plan_info = self._cached_module.plan(
|
||||||
self._float_workspace_buffer,
|
self._float_workspace_buffer,
|
||||||
self._int_workspace_buffer,
|
self._int_workspace_buffer,
|
||||||
@ -1528,6 +1528,7 @@ def fast_plan_decode(
|
|||||||
window_left,
|
window_left,
|
||||||
fixed_split_size,
|
fixed_split_size,
|
||||||
disable_split_kv,
|
disable_split_kv,
|
||||||
|
0,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Error in tensor core plan: {e}") from e
|
raise RuntimeError(f"Error in tensor core plan: {e}") from e
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user