From a1abb576368a0be452cdf52f76175445a4dbc9da Mon Sep 17 00:00:00 2001
From: atalman <atalman@fb.com>
Date: Tue, 11 Nov 2025 14:15:01 -0800
Subject: [PATCH 1/3] Update PyTorch from 2.9.1 to 2.10.0 (release candidate testing)

tests

xformers

xformers_release

torchao_pin_advance

Update conv.py

Update env_override.py

[Release 2.10] Test Torch 2.10 RC

triton

release_210_testing

release_210_testing

release_210_testing

release_210_testing

update_to_210

fix_210_resr

update_291_test

python_only_compile
---
 .pre-commit-config.yaml                         |  2 +-
 CMakeLists.txt                                  | 10 +++++-----
 docker/Dockerfile                               | 17 +++++++++--------
 docker/Dockerfile.cpu                           |  2 +-
 pyproject.toml                                  |  2 +-
 requirements/build.txt                          |  2 +-
 requirements/cuda.txt                           |  6 +++---
 requirements/rocm-build.txt                     | 11 +++++------
 requirements/test.in                            |  6 +++---
 requirements/test.txt                           | 10 +++++-----
 tests/compile/test_aot_compile.py               | 10 +++++-----
 .../compile/test_dynamic_shapes_compilation.py  |  2 +-
 tests/standalone_tests/python_only_compile.sh   |  3 ++-
 vllm/compilation/decorators.py                  |  8 ++++----
 vllm/envs.py                                    |  2 +-
 vllm/model_executor/layers/batch_invariant.py   |  2 +-
 16 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e034f75a9d322..422153e28f47e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -38,7 +38,7 @@ repos:
   rev: 0.9.1
   hooks:
     - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --extra-index-url, https://download.pytorch.org/whl/test/cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
       files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c46fb18d7bfef..b7a77a8c450bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,8 +56,8 @@ endif()
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0")
 
 #
 # Try to find python package with an executable that exactly matches
@@ -432,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
     endif()
 
-    if (MARLIN_SM75_ARCHS) 
+    if (MARLIN_SM75_ARCHS)
       file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu")
       set_gencode_flags_for_srcs(
         SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}"
@@ -444,7 +444,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC})
     endif()
 
-    if (MARLIN_FP8_ARCHS) 
+    if (MARLIN_FP8_ARCHS)
       file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
       set_gencode_flags_for_srcs(
         SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
@@ -1042,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
     endif()
 
-    if (MARLIN_MOE_SM75_ARCHS) 
+    if (MARLIN_MOE_SM75_ARCHS)
       file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu")
       set_gencode_flags_for_srcs(
         SRCS "${MARLIN_MOE_SM75_SRC}"
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 679ffc4a7df5f..acc530a1c88a6 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -55,7 +55,7 @@ ARG UV_INDEX_URL=${PIP_INDEX_URL}
 ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 
 # PyTorch provides its own indexes for standard and nightly builds
-ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
+ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl/test
 
 # PIP supports multiple authentication schemes, including keyring
 # By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
@@ -135,7 +135,7 @@ WORKDIR /workspace
 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
+    uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/cuda.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 # CUDA arch list used by torch
@@ -303,7 +303,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
 ENV UV_LINK_MODE=copy
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \
+    uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/build.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 WORKDIR /workspace
@@ -367,7 +367,7 @@ COPY requirements/lint.txt requirements/lint.txt
 COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \
+    uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/dev.txt \
     --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################
 #################### vLLM installation IMAGE ####################
@@ -465,7 +465,7 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL
 COPY requirements/common.txt /tmp/common.txt
 COPY requirements/cuda.txt /tmp/requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r /tmp/requirements-cuda.txt \
+    uv pip install --system --prerelease=allow -r /tmp/requirements-cuda.txt \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
     rm /tmp/requirements-cuda.txt /tmp/common.txt
 
@@ -522,9 +522,10 @@ ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system dist/*.whl --verbose \
+    uv pip install --prerelease=allow --system dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
+
 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 uv pip list
@@ -544,7 +545,7 @@ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 # Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
 RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system ep_kernels/dist/*.whl --verbose \
+    uv pip install --prerelease=allow --system ep_kernels/dist/*.whl --verbose \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 
 # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
@@ -587,7 +588,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 RUN --mount=type=cache,target=/root/.cache/uv \
     CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
     if [ "$CUDA_MAJOR" -ge 12 ]; then \
-        uv pip install --system -r requirements/dev.txt \
+        uv pip install --prerelease=allow --system -r requirements/dev.txt \
         --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
     fi
 
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index 2caf1ad144178..f835803ad6aa4 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -26,7 +26,7 @@ FROM ubuntu:22.04 AS base-common
 WORKDIR /workspace/
 
 ARG PYTHON_VERSION=3.12
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/test/cpu"
 
 # Install minimal dependencies and uv
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
diff --git a/pyproject.toml b/pyproject.toml
index c03f96dd7acd5..3204c382ed7b9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "packaging>=24.2",
     "setuptools>=77.0.3,<81.0.0",
     "setuptools-scm>=8.0",
-    "torch == 2.9.1",
+    "torch == 2.10.0",
     "wheel",
     "jinja2",
 ]
diff --git a/requirements/build.txt b/requirements/build.txt
index 3756371638bad..44534adb861db 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -4,7 +4,7 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
-torch==2.9.1
+torch==2.10.0
 wheel
 jinja2>=3.1.6
 regex
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index 1417fb99120bc..c00a99084a485 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
 
 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.9.1
-torchaudio==2.9.1
+torch==2.10.0
+torchaudio==2.10.0
 # These must be updated alongside torch
-torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
 flashinfer-python==0.5.3
diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt
index 54af9d995c4a2..01a71c2da38c8 100644
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -1,12 +1,11 @@
 # Common dependencies
 -r common.txt
 
---extra-index-url https://download.pytorch.org/whl/rocm6.4
-torch==2.9.1
-torchvision==0.24.1
-torchaudio==2.9.1
-
-triton==3.5.1
+--extra-index-url https://download.pytorch.org/whl/test/rocm7.0
+torch==2.10.0
+torchvision==0.25.0
+torchaudio==2.10.0
+triton==3.6.0
 cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
diff --git a/requirements/test.in b/requirements/test.in
index b3fd733fb1bc0..4b9ecaed1a65d 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -24,9 +24,9 @@ soundfile # required for audio tests
 jiwer # required for audio tests
 tblib # for pickling test exceptions
 timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.9.1
-torchaudio==2.9.1
-torchvision==0.24.1
+torch==2.10.0
+torchaudio==2.10.0
+torchvision==0.25.0
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.5 # required for voxtral test
diff --git a/requirements/test.txt b/requirements/test.txt
index 4012c2d3b212b..e1930e136df71 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -608,7 +608,7 @@ nvidia-nvjitlink-cu12==12.9.86
     #   nvidia-cusolver-cu12
     #   nvidia-cusparse-cu12
     #   torch
-nvidia-nvshmem-cu12==3.3.20
+nvidia-nvshmem-cu12==3.4.5
     # via torch
 nvidia-nvtx-cu12==12.9.79
     # via torch
@@ -1123,7 +1123,7 @@ tomli==2.2.1
     # via schemathesis
 tomli-w==1.2.0
     # via schemathesis
-torch==2.9.1+cu129
+torch==2.10.0+cu129
     # via
     #   -r requirements/test.in
     #   accelerate
@@ -1152,7 +1152,7 @@ torch==2.9.1+cu129
     #   torchvision
     #   vector-quantize-pytorch
     #   vocos
-torchaudio==2.9.1+cu129
+torchaudio==2.10.0+cu129
     # via
     #   -r requirements/test.in
     #   encodec
@@ -1165,7 +1165,7 @@ torchmetrics==1.7.4
     #   pytorch-lightning
     #   terratorch
     #   torchgeo
-torchvision==0.24.1+cu129
+torchvision==0.25.0+cu129
     # via
     #   -r requirements/test.in
     #   lightly
@@ -1206,7 +1206,7 @@ transformers==4.57.3
     #   transformers-stream-generator
 transformers-stream-generator==0.0.5
     # via -r requirements/test.in
-triton==3.5.1
+triton==3.6.0
     # via torch
 tritonclient==2.51.0
     # via
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
index 2ffcd627e476a..d462fa02499d3 100644
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -56,7 +56,7 @@ def use_vllm_config(vllm_config: VllmConfig):
 
 
 @pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
@@ -80,7 +80,7 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch):
 
 
 @pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
     with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m:
@@ -94,7 +94,7 @@ def test_force_aot_load(monkeypatch: pytest.MonkeyPatch):
 
 
 @pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
@@ -116,7 +116,7 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
 
 
 @pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_shape_env(monkeypatch: pytest.MonkeyPatch):
     """
@@ -149,7 +149,7 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch):
 
 
 @pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 @create_new_process_for_each_test("spawn")
 def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch):
diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
index 1fda21dea6361..0d9931ec33b2a 100644
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -40,7 +40,7 @@ def get_test_models():
 @pytest.mark.parametrize("use_bytecode_hook", [True, False])
 @pytest.mark.parametrize("evaluate_guards", [False, True])
 @pytest.mark.skipif(
-    not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+    not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10"
 )
 def test_dynamic_shapes_compilation(
     monkeypatch,
diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh
index ebf199a5056fb..a2fd3e358f2e2 100644
--- a/tests/standalone_tests/python_only_compile.sh
+++ b/tests/standalone_tests/python_only_compile.sh
@@ -67,7 +67,8 @@ apt autoremove -y
 
 echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py
 
-VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e .
+
+VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . --extra-index-url https://download.pytorch.org/whl/test/cu129/
 
 # Run the script
 python3 -c 'import vllm'
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 40bde97ac61d8..89c625de9cebd 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -316,7 +316,7 @@ def _support_torch_compile(
     def _mark_dynamic_inputs(mod, type, *args, **kwargs):
         def mark_dynamic(arg, dims):
             if type == DynamicShapesType.UNBACKED:
-                if is_torch_equal_or_newer("2.10.0.dev"):
+                if is_torch_equal_or_newer("2.10.0"):
                     for dim in dims:
                         torch._dynamo.decorators.mark_unbacked(
                             arg, dim, hint_override=arg.size()[dim]
@@ -356,7 +356,7 @@ def _support_torch_compile(
                     if isinstance(arg, torch.Tensor):
                         # In case dims is specified with negative indexing
                         dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
-                        if is_torch_equal_or_newer("2.10.0.dev"):
+                        if is_torch_equal_or_newer("2.10.0"):
                             for dim in dims:
                                 torch._dynamo.decorators.mark_unbacked(
                                     arg, dim, hint_override=arg.size()[dim]
@@ -496,9 +496,9 @@ def _support_torch_compile(
             fx_config_patches["backed_size_oblivious"] = True
 
         # Prepare inductor config patches
-        # assume_32bit_indexing is only available in torch 2.10.0.dev+
+        # assume_32bit_indexing is only available in torch 2.10.0+
         inductor_config_patches = {}
-        if is_torch_equal_or_newer("2.10.0.dev"):
+        if is_torch_equal_or_newer("2.10.0"):
             inductor_config_patches["assume_32bit_indexing"] = True
 
         with (
diff --git a/vllm/envs.py b/vllm/envs.py
index 1d4128d74b95c..25e855d768aa9 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -286,7 +286,7 @@ def use_aot_compile() -> bool:
 
     default_value = (
         "1"
-        if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache()
+        if is_torch_equal_or_newer("2.10.0") and not disable_compile_cache()
         else "0"
     )
 
diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py
index 1058270889b29..dbd7540646d02 100644
--- a/vllm/model_executor/layers/batch_invariant.py
+++ b/vllm/model_executor/layers/batch_invariant.py
@@ -974,7 +974,7 @@ def enable_batch_invariant_mode():
     )
 
     reduced_precision_val = (
-        (False, False) if is_torch_equal_or_newer("2.10.0.dev") else False
+        (False, False) if is_torch_equal_or_newer("2.10.0") else False
     )
     torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
         reduced_precision_val

From 6ca18fcc57d2e01904e81cc51811b635e09ea272 Mon Sep 17 00:00:00 2001
From: atalman <atalman@fb.com>
Date: Tue, 16 Dec 2025 15:37:36 -0800
Subject: [PATCH 2/3] release_210_testing

---
 RELEASE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RELEASE.md b/RELEASE.md
index db0d51afc7be1..d3e6d3b9b7bdd 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -13,7 +13,7 @@ vLLM uses a “right-shifted” versioning scheme where a new patch release is o
 
 ## Release Cadence
 
-Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
+Patch releases are published on a bi-weekly basis. A post release follows 1-2 days after the patch release and uses the same branch as the patch release.
 Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
 
 | Release Date | Patch release versions | Post Release versions |

From 902d7df99aec81621dd90923d57b5a189a732a8d Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Tue, 23 Dec 2025 13:10:02 -0800
Subject: [PATCH 3/3] Fix eagle dp tests on A100

`TP_SIZE=1 DP_SIZE=2 pytest -v -s tests/v1/distributed/test_eagle_dp.py` fails
on A100 for me before this PR.

Here's what I think is happening:
- the test is checking that the tokens produced by a model with eagle are
  identical to those produced by a model without eagle
- the model with eagle uses a draft model to produce draft tokens
- the target model takes all of the draft tokens and then does a forward
  pass to see how many of the tokens to accept/reject. The target model
  is using a batch_size > 1.
- the model without eagle just generates the tokens one-by-one, that is,
  it has batch_size = 1.
- For these two models to be *consistent*, we need batch invariance. So
  I turned on batch invariance (which also required the selection of an
  attention backend)

Signed-off-by: Richard Zou <zou3519@gmail.com>
---
 tests/v1/distributed/test_eagle_dp.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/v1/distributed/test_eagle_dp.py b/tests/v1/distributed/test_eagle_dp.py
index 9f6a6614fc1fd..f529fce0ab068 100644
--- a/tests/v1/distributed/test_eagle_dp.py
+++ b/tests/v1/distributed/test_eagle_dp.py
@@ -16,7 +16,12 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2))
 
 
 @pytest.mark.asyncio
-async def test_run_eagle_dp():
+async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch):
+    # This test checks that running a model with and without eagle
+    # leads to identical tokens. This is only true in batch invariant mode
+    # (because the target model verifies all draft tokens in one big forward pass)
+    monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1")
+
     target_model = "meta-llama/Llama-3.1-8B-Instruct"
     draft_model = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
 
@@ -29,6 +34,7 @@ async def test_run_eagle_dp():
         data_parallel_backend="mp",  # ray takes more time
         trust_remote_code=True,
         max_model_len=16384,
+        attention_config={"backend": "FLASH_ATTN"},
     )
 
     eagle_engine_args = replace(