mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-15 10:57:06 +08:00
Merge 902d7df99aec81621dd90923d57b5a189a732a8d into 254f6b986720c92ddf97fbb1a6a6465da8e87e29
This commit is contained in:
commit
1829f5dfb6
@ -38,7 +38,7 @@ repos:
|
||||
rev: 0.9.1
|
||||
hooks:
|
||||
- id: pip-compile
|
||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
|
||||
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --extra-index-url, https://download.pytorch.org/whl/test/cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
|
||||
files: ^requirements/test\.(in|txt)$
|
||||
- repo: local
|
||||
hooks:
|
||||
|
||||
@ -56,8 +56,8 @@ endif()
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from docker/Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
@ -432,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
if (MARLIN_SM75_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
|
||||
@ -444,7 +444,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
if (MARLIN_FP8_ARCHS)
|
||||
file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
|
||||
@ -1042,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
|
||||
endif()
|
||||
|
||||
if (MARLIN_MOE_SM75_ARCHS)
|
||||
if (MARLIN_MOE_SM75_ARCHS)
|
||||
file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${MARLIN_MOE_SM75_SRC}"
|
||||
|
||||
@ -13,7 +13,7 @@ vLLM uses a “right-shifted” versioning scheme where a new patch release is o
|
||||
|
||||
## Release Cadence
|
||||
|
||||
Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
|
||||
Patch release is released on bi-weekly basis. Post release 1-2 days after patch release and uses same branch as patch release.
|
||||
Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
|
||||
|
||||
| Release Date | Patch release versions | Post Release versions |
|
||||
|
||||
@ -55,7 +55,7 @@ ARG UV_INDEX_URL=${PIP_INDEX_URL}
|
||||
ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
||||
|
||||
# PyTorch provides its own indexes for standard and nightly builds
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
|
||||
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl/test
|
||||
|
||||
# PIP supports multiple authentication schemes, including keyring
|
||||
# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
|
||||
@ -135,7 +135,7 @@ WORKDIR /workspace
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY requirements/cuda.txt requirements/cuda.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
|
||||
uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
# CUDA arch list used by torch
|
||||
@ -303,7 +303,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
|
||||
uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/build.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
WORKDIR /workspace
|
||||
@ -367,7 +367,7 @@ COPY requirements/lint.txt requirements/lint.txt
|
||||
COPY requirements/test.txt requirements/test.txt
|
||||
COPY requirements/dev.txt requirements/dev.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
|
||||
uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
#################### DEV IMAGE ####################
|
||||
#################### vLLM installation IMAGE ####################
|
||||
@ -465,7 +465,7 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
|
||||
COPY requirements/common.txt /tmp/common.txt
|
||||
COPY requirements/cuda.txt /tmp/requirements-cuda.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r /tmp/requirements-cuda.txt \
|
||||
uv pip install --system --prerelease=allow -r /tmp/requirements-cuda.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
|
||||
rm /tmp/requirements-cuda.txt /tmp/common.txt
|
||||
|
||||
@ -522,9 +522,10 @@ ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
|
||||
# Install vllm wheel first, so that torch etc will be installed.
|
||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system dist/*.whl --verbose \
|
||||
uv pip install --prerelease=allow --system dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
. /etc/environment && \
|
||||
uv pip list
|
||||
@ -544,7 +545,7 @@ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
|
||||
# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
|
||||
RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system ep_kernels/dist/*.whl --verbose \
|
||||
uv pip install --prerelease=allow --system ep_kernels/dist/*.whl --verbose \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
|
||||
|
||||
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
|
||||
@ -587,7 +588,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
|
||||
if [ "$CUDA_MAJOR" -ge 12 ]; then \
|
||||
uv pip install --system -r requirements/dev.txt \
|
||||
uv pip install --prerelease=allow --system -r requirements/dev.txt \
|
||||
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||
fi
|
||||
|
||||
|
||||
@ -26,7 +26,7 @@ FROM ubuntu:22.04 AS base-common
|
||||
WORKDIR /workspace/
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/test/cpu"
|
||||
|
||||
# Install minimal dependencies and uv
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
|
||||
@ -6,7 +6,7 @@ requires = [
|
||||
"packaging>=24.2",
|
||||
"setuptools>=77.0.3,<81.0.0",
|
||||
"setuptools-scm>=8.0",
|
||||
"torch == 2.9.1",
|
||||
"torch == 2.10.0",
|
||||
"wheel",
|
||||
"jinja2",
|
||||
]
|
||||
|
||||
@ -4,7 +4,7 @@ ninja
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<81.0.0
|
||||
setuptools-scm>=8
|
||||
torch==2.9.1
|
||||
torch==2.10.0
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
regex
|
||||
|
||||
@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
|
||||
|
||||
# Dependencies for NVIDIA GPUs
|
||||
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||
torch==2.9.1
|
||||
torchaudio==2.9.1
|
||||
torch==2.10.0
|
||||
torchaudio==2.10.0
|
||||
# These must be updated alongside torch
|
||||
torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
||||
# FlashInfer should be updated together with the Dockerfile
|
||||
flashinfer-python==0.5.3
|
||||
|
||||
@ -1,12 +1,11 @@
|
||||
# Common dependencies
|
||||
-r common.txt
|
||||
|
||||
--extra-index-url https://download.pytorch.org/whl/rocm6.4
|
||||
torch==2.9.1
|
||||
torchvision==0.24.1
|
||||
torchaudio==2.9.1
|
||||
|
||||
triton==3.5.1
|
||||
--extra-index-url https://download.pytorch.org/whl/test/rocm7.0
|
||||
torch==2.10.0
|
||||
torchvision==0.25.0
|
||||
torchaudio==2.10.0
|
||||
triton==3.6.0
|
||||
cmake>=3.26.1,<4
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
|
||||
@ -24,9 +24,9 @@ soundfile # required for audio tests
|
||||
jiwer # required for audio tests
|
||||
tblib # for pickling test exceptions
|
||||
timm >=1.0.17 # required for internvl and gemma3n-mm test
|
||||
torch==2.9.1
|
||||
torchaudio==2.9.1
|
||||
torchvision==0.24.1
|
||||
torch==2.10.0
|
||||
torchaudio==2.10.0
|
||||
torchvision==0.25.0
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[image,audio] >= 1.8.5 # required for voxtral test
|
||||
|
||||
@ -608,7 +608,7 @@ nvidia-nvjitlink-cu12==12.9.86
|
||||
# nvidia-cusolver-cu12
|
||||
# nvidia-cusparse-cu12
|
||||
# torch
|
||||
nvidia-nvshmem-cu12==3.3.20
|
||||
nvidia-nvshmem-cu12==3.4.5
|
||||
# via torch
|
||||
nvidia-nvtx-cu12==12.9.79
|
||||
# via torch
|
||||
@ -1123,7 +1123,7 @@ tomli==2.2.1
|
||||
# via schemathesis
|
||||
tomli-w==1.2.0
|
||||
# via schemathesis
|
||||
torch==2.9.1+cu129
|
||||
torch==2.10.0+cu129
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# accelerate
|
||||
@ -1152,7 +1152,7 @@ torch==2.9.1+cu129
|
||||
# torchvision
|
||||
# vector-quantize-pytorch
|
||||
# vocos
|
||||
torchaudio==2.9.1+cu129
|
||||
torchaudio==2.10.0+cu129
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# encodec
|
||||
@ -1165,7 +1165,7 @@ torchmetrics==1.7.4
|
||||
# pytorch-lightning
|
||||
# terratorch
|
||||
# torchgeo
|
||||
torchvision==0.24.1+cu129
|
||||
torchvision==0.25.0+cu129
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# lightly
|
||||
@ -1206,7 +1206,7 @@ transformers==4.57.3
|
||||
# transformers-stream-generator
|
||||
transformers-stream-generator==0.0.5
|
||||
# via -r requirements/test.in
|
||||
triton==3.5.1
|
||||
triton==3.6.0
|
||||
# via torch
|
||||
tritonclient==2.51.0
|
||||
# via
|
||||
|
||||
@ -56,7 +56,7 @@ def use_vllm_config(vllm_config: VllmConfig):
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
@ -80,7 +80,7 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
|
||||
@ -94,7 +94,7 @@ def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
|
||||
with monkeypatch.context() as m:
|
||||
@ -116,7 +116,7 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_shape_env(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
@ -149,7 +149,7 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
|
||||
)
|
||||
@create_new_process_for_each_test("spawn")
|
||||
def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
|
||||
|
||||
@ -40,7 +40,7 @@ def get_test_models():
|
||||
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
||||
@pytest.mark.parametrize("evaluate_guards", [False, True])
|
||||
@pytest.mark.skipif(
|
||||
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
||||
not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
|
||||
)
|
||||
def test_dynamic_shapes_compilation(
|
||||
monkeypatch,
|
||||
|
||||
@ -67,7 +67,8 @@ apt autoremove -y
|
||||
|
||||
echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py
|
||||
|
||||
VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
|
||||
|
||||
VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . --extra-index-url https://download.pytorch.org/whl/test/cu129/
|
||||
|
||||
# Run the script
|
||||
python3 -c 'import vllm'
|
||||
|
||||
@ -316,7 +316,7 @@ def _support_torch_compile(
|
||||
def _mark_dynamic_inputs(mod, type, *args, **kwargs):
|
||||
def mark_dynamic(arg, dims):
|
||||
if type == DynamicShapesType.UNBACKED:
|
||||
if is_torch_equal_or_newer("2.10.0.dev"):
|
||||
if is_torch_equal_or_newer("2.10.0"):
|
||||
for dim in dims:
|
||||
torch._dynamo.decorators.mark_unbacked(
|
||||
arg, dim, hint_override=arg.size()[dim]
|
||||
@ -356,7 +356,7 @@ def _support_torch_compile(
|
||||
if isinstance(arg, torch.Tensor):
|
||||
# In case dims is specified with negative indexing
|
||||
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
|
||||
if is_torch_equal_or_newer("2.10.0.dev"):
|
||||
if is_torch_equal_or_newer("2.10.0"):
|
||||
for dim in dims:
|
||||
torch._dynamo.decorators.mark_unbacked(
|
||||
arg, dim, hint_override=arg.size()[dim]
|
||||
@ -496,9 +496,9 @@ def _support_torch_compile(
|
||||
fx_config_patches["backed_size_oblivious"] = True
|
||||
|
||||
# Prepare inductor config patches
|
||||
# assume_32bit_indexing is only available in torch 2.10.0.dev+
|
||||
# assume_32bit_indexing is only available in torch 2.10.0+
|
||||
inductor_config_patches = {}
|
||||
if is_torch_equal_or_newer("2.10.0.dev"):
|
||||
if is_torch_equal_or_newer("2.10.0"):
|
||||
inductor_config_patches["assume_32bit_indexing"] = True
|
||||
|
||||
with (
|
||||
|
||||
@ -286,7 +286,7 @@ def use_aot_compile() -> bool:
|
||||
|
||||
default_value = (
|
||||
"1"
|
||||
if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
|
||||
if is_torch_equal_or_newer("2.10.0") and not disable_compile_cache()
|
||||
else "0"
|
||||
)
|
||||
|
||||
|
||||
@ -974,7 +974,7 @@ def enable_batch_invariant_mode():
|
||||
)
|
||||
|
||||
reduced_precision_val = (
|
||||
(False, False) if is_torch_equal_or_newer("2.10.0.dev") else False
|
||||
(False, False) if is_torch_equal_or_newer("2.10.0") else False
|
||||
)
|
||||
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
|
||||
reduced_precision_val
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user