Merge 902d7df99aec81621dd90923d57b5a189a732a8d into 254f6b986720c92ddf97fbb1a6a6465da8e87e29

2026-07-17 05:47:10 +08:00 · 2025-12-25 00:07:21 +00:00 · 2025-12-25 00:07:21 +00:00 · 1829f5dfb6
commit 1829f5dfb6
parent 254f6b9867 902d7df99a
17 changed files with 49 additions and 48 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -38,7 +38,7 @@ repos:
  rev: 0.9.1
  hooks:
    - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --extra-index-url, https://download.pytorch.org/whl/test/cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -432,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
    endif()

-    if (MARLIN_SM75_ARCHS) 
+    if (MARLIN_SM75_ARCHS)
      file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
@@ -444,7 +444,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
    endif()

-    if (MARLIN_FP8_ARCHS) 
+    if (MARLIN_FP8_ARCHS)
      file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
@@ -1042,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
    endif()

-    if (MARLIN_MOE_SM75_ARCHS) 
+    if (MARLIN_MOE_SM75_ARCHS)
      file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
      set_gencode_flags_for_srcs(
        SRCS "${MARLIN_MOE_SM75_SRC}"
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -13,7 +13,7 @@ vLLM uses a “right-shifted” versioning scheme where a new patch release is o

 ## Release Cadence

-Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
+Patch release is released on bi-weekly basis. Post release 1-2 days after patch release and uses same branch as patch release.
 Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.

 | Release Date | Patch release versions | Post Release versions |
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -55,7 +55,7 @@ ARG UV_INDEX_URL=${PIP_INDEX_URL}
 ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}

 # PyTorch provides its own indexes for standard and nightly builds
-ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
+ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl/test

 # PIP supports multiple authentication schemes, including keyring
 # By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
@@ -135,7 +135,7 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/cuda.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # CUDA arch list used by torch
@@ -303,7 +303,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy

 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+    uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/build.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 WORKDIR /workspace
@@ -367,7 +367,7 @@ COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
+    uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/dev.txt \
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################
 #################### vLLM installation IMAGE ####################
@@ -465,7 +465,7 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
 COPY requirements/common.txt /tmp/common.txt
 COPY requirements/cuda.txt /tmp/requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r /tmp/requirements-cuda.txt \
+    uv pip install --system --prerelease=allow -r /tmp/requirements-cuda.txt \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
    rm /tmp/requirements-cuda.txt /tmp/common.txt

@@ -522,9 +522,10 @@ ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
+    uv pip install --prerelease=allow --system dist/*.whl --verbose \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

+
 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 uv pip list
@@ -544,7 +545,7 @@ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 # Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
 RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
    --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system ep_kernels/dist/*.whl --verbose \
+    uv pip install --prerelease=allow --system ep_kernels/dist/*.whl --verbose \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
@@ -587,7 +588,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 RUN --mount=type=cache,target=/root/.cache/uv \
    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
    if [ "$CUDA_MAJOR" -ge 12 ]; then \
-        uv pip install --system -r requirements/dev.txt \
+        uv pip install --prerelease=allow --system -r requirements/dev.txt \
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
    fi

--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -26,7 +26,7 @@ FROM ubuntu:22.04 AS base-common
 WORKDIR /workspace/

 ARG PYTHON_VERSION=3.12
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/test/cpu"

 # Install minimal dependencies and uv
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
    "packaging>=24.2",
    "setuptools>=77.0.3,<81.0.0",
    "setuptools-scm>=8.0",
-    "torch == 2.9.1",
+    "torch == 2.10.0",
    "wheel",
    "jinja2",
 ]
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -4,7 +4,7 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
-torch==2.9.1
+torch==2.10.0
 wheel
 jinja2>=3.1.6
 regex
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding

 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.9.1
-torchaudio==2.9.1
+torch==2.10.0
+torchaudio==2.10.0
 # These must be updated alongside torch
-torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
 flashinfer-python==0.5.3
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -1,12 +1,11 @@
 # Common dependencies
 -r common.txt

--extra-index-url https://download.pytorch.org/whl/rocm6.4
-torch==2.9.1
-torchvision==0.24.1
-torchaudio==2.9.1
-
-triton==3.5.1
+--extra-index-url https://download.pytorch.org/whl/test/rocm7.0
+torch==2.10.0
+torchvision==0.25.0
+torchaudio==2.10.0
+triton==3.6.0
 cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -24,9 +24,9 @@ soundfile # required for audio tests
 jiwer # required for audio tests
 tblib # for pickling test exceptions
 timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.9.1
-torchaudio==2.9.1
-torchvision==0.24.1
+torch==2.10.0
+torchaudio==2.10.0
+torchvision==0.25.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.5 # required for voxtral test
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -608,7 +608,7 @@ nvidia-nvjitlink-cu12==12.9.86
    #   nvidia-cusolver-cu12
    #   nvidia-cusparse-cu12
    #   torch
-nvidia-nvshmem-cu12==3.3.20
+nvidia-nvshmem-cu12==3.4.5
    # via torch
 nvidia-nvtx-cu12==12.9.79
    # via torch
@@ -1123,7 +1123,7 @@ tomli==2.2.1
    # via schemathesis
 tomli-w==1.2.0
    # via schemathesis
-torch==2.9.1+cu129
+torch==2.10.0+cu129
    # via
    #   -r requirements/test.in
    #   accelerate
@@ -1152,7 +1152,7 @@ torch==2.9.1+cu129
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.9.1+cu129
+torchaudio==2.10.0+cu129
    # via
    #   -r requirements/test.in
    #   encodec
@@ -1165,7 +1165,7 @@ torchmetrics==1.7.4
    #   pytorch-lightning
    #   terratorch
    #   torchgeo
-torchvision==0.24.1+cu129
+torchvision==0.25.0+cu129
    # via
    #   -r requirements/test.in
    #   lightly
@@ -1206,7 +1206,7 @@ transformers==4.57.3
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.5.1
+triton==3.6.0
    # via torch
 tritonclient==2.51.0
    # via
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -56,7 +56,7 @@ def use_vllm_config(vllm_config: VllmConfig):


@pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
@@ -80,7 +80,7 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):


@pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
    with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
@@ -94,7 +94,7 @@ def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):


@pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
@@ -116,7 +116,7 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):


@pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_shape_env(monkeypatch: pytest.MonkeyPatch):
    """
@@ -149,7 +149,7 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):


@pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
@create_new_process_for_each_test("spawn")
 def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -40,7 +40,7 @@ def get_test_models():
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
@pytest.mark.parametrize("evaluate_guards", [False, True])
@pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_dynamic_shapes_compilation(
    monkeypatch,
--- a/tests/standalone_tests/python_only_compile.sh
+++ b/tests/standalone_tests/python_only_compile.sh
@@ -67,7 +67,8 @@ apt autoremove -y

 echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py

-VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
+
+VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . --extra-index-url https://download.pytorch.org/whl/test/cu129/

 # Run the script
 python3 -c 'import vllm'
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -316,7 +316,7 @@ def _support_torch_compile(
    def _mark_dynamic_inputs(mod, type, *args, **kwargs):
        def mark_dynamic(arg, dims):
            if type == DynamicShapesType.UNBACKED:
-                if is_torch_equal_or_newer("2.10.0.dev"):
+                if is_torch_equal_or_newer("2.10.0"):
                    for dim in dims:
                        torch._dynamo.decorators.mark_unbacked(
                            arg, dim, hint_override=arg.size()[dim]
@@ -356,7 +356,7 @@ def _support_torch_compile(
                    if isinstance(arg, torch.Tensor):
                        # In case dims is specified with negative indexing
                        dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
-                        if is_torch_equal_or_newer("2.10.0.dev"):
+                        if is_torch_equal_or_newer("2.10.0"):
                            for dim in dims:
                                torch._dynamo.decorators.mark_unbacked(
                                    arg, dim, hint_override=arg.size()[dim]
@@ -496,9 +496,9 @@ def _support_torch_compile(
            fx_config_patches["backed_size_oblivious"] = True

        # Prepare inductor config patches
-        # assume_32bit_indexing is only available in torch 2.10.0.dev+
+        # assume_32bit_indexing is only available in torch 2.10.0+
        inductor_config_patches = {}
-        if is_torch_equal_or_newer("2.10.0.dev"):
+        if is_torch_equal_or_newer("2.10.0"):
            inductor_config_patches["assume_32bit_indexing"] = True

        with (
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -286,7 +286,7 @@ def use_aot_compile() -> bool:

    default_value = (
        "1"
-        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
+        if is_torch_equal_or_newer("2.10.0") and not disable_compile_cache()
        else "0"
    )

--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -974,7 +974,7 @@ def enable_batch_invariant_mode():
    )

    reduced_precision_val = (
-        (False, False) if is_torch_equal_or_newer("2.10.0.dev") else False
+        (False, False) if is_torch_equal_or_newer("2.10.0") else False
    )
    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
        reduced_precision_val