Update Flashinfer from v0.4.1 to v0.5.2 (#27952)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 67a2da890e
commit 811df41ee9
@@ -132,9 +132,7 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
-    uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
-    && uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # cuda arch list used by torch
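Aside: the `cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')` fragment above maps the image's CUDA version to the tag that the PyTorch and FlashInfer wheel indexes publish under (e.g. 12.9.1 becomes cu129). A minimal Python sketch of the same mapping, for illustration only (the Dockerfile does this in shell):

def cuda_wheel_tag(cuda_version: str) -> str:
    """Map a full CUDA version string to a wheel index tag.

    Mirrors the shell pipeline `cut -d. -f1,2 | tr -d '.'`:
    keep major.minor, drop the dot, prefix with "cu".
    """
    major, minor = cuda_version.split(".")[:2]
    return f"cu{major}{minor}"

assert cuda_wheel_tag("12.9.1") == "cu129"
assert cuda_wheel_tag("13.0") == "cu130"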
@@ -356,16 +354,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
-    uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
-    && uv pip install --system dist/*.whl --verbose \
+    uv pip install --system dist/*.whl --verbose \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # Install FlashInfer pre-compiled kernel cache and binaries
 # https://docs.flashinfer.ai/installation.html
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system flashinfer-cubin==0.4.1 \
-    && uv pip install --system flashinfer-jit-cache==0.4.1 \
+    uv pip install --system flashinfer-cubin==0.5.2 \
+    && uv pip install --system flashinfer-jit-cache==0.5.2 \
        --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
     && flashinfer show-config

@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.


 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.4.1
+# release version: v0.5.2
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
     echo "git clone flashinfer..." \
     && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
-    && git checkout v0.4.1\
+    && git checkout v0.5.2 \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
     && rm -rf build \
@@ -12,4 +12,4 @@ torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytor
 # Build from https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
 xformers==0.0.33+5d4b92a5.d20251029; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.4.1
+flashinfer-python==0.5.2
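Aside: the comment above says the flashinfer-python pin must move together with the flashinfer-cubin / flashinfer-jit-cache pins in the Dockerfile. A hypothetical sanity check (not part of this change) that could be run inside the built image to confirm the components agree:

from importlib.metadata import PackageNotFoundError, version

def check_flashinfer_pins(expected: str = "0.5.2") -> None:
    # Hypothetical helper: assert all installed FlashInfer components
    # carry the same version, since the pins must be updated together.
    for pkg in ("flashinfer-python", "flashinfer-cubin", "flashinfer-jit-cache"):
        try:
            found = version(pkg)
        except PackageNotFoundError:
            continue  # optional component not installed in this environment
        assert found == expected, f"{pkg} is {found}, expected {expected}"

check_flashinfer_pins()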
@@ -238,9 +238,11 @@ def test_flashinfer_trtllm_decode_with_baseline(
     if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
         rtol, atol = 7e-2, 9e-2
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
-        rtol, atol = 2e-2, 4e-2
+        rtol, atol = 3e-2, 4e-2
     elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
-        rtol, atol = 1e-2, 2e-2
+        rtol, atol = 2e-2, 2e-2
+    elif kv_quant_dtype == FP8_DTYPE:
+        rtol, atol = 4e-2, 6e-2
     else:
         rtol, atol = 1e-2, 1e-2

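Aside: this hunk loosens the FP8-output and FP8-query tolerances and adds a looser branch for FP8 KV caches. A small sketch of what the rtol/atol pairs mean, assuming the test applies them via torch.testing.assert_close, whose pass criterion is |actual - expected| <= atol + rtol * |expected|, checked elementwise:

import torch

# Sketch only: a deviation of 0.01 passes both loosened tolerance pairs,
# since atol alone (4e-2 and 6e-2) already covers it.
expected = torch.randn(128)
actual = expected + 0.01

torch.testing.assert_close(actual, expected, rtol=3e-2, atol=4e-2)  # FP8 q + FP8 out
torch.testing.assert_close(actual, expected, rtol=4e-2, atol=6e-2)  # new FP8 KV-cache branch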