From a1abb576368a0be452cdf52f76175445a4dbc9da Mon Sep 17 00:00:00 2001 From: atalman Date: Tue, 11 Nov 2025 14:15:01 -0800 Subject: [PATCH 1/3] update to 2.9.1 PyTorch release tests xformers xformers_release torchao_pin_advance Update conv.py Update env_override.py [Release 2.10] Test Torch 2.10 RC triton release_210_testing release_210_testing release_210_testing release_210_testing update_to_210 fix_210_resr update_291_test python_only_compile --- .pre-commit-config.yaml | 2 +- CMakeLists.txt | 10 +++++----- docker/Dockerfile | 17 +++++++++-------- docker/Dockerfile.cpu | 2 +- pyproject.toml | 2 +- requirements/build.txt | 2 +- requirements/cuda.txt | 6 +++--- requirements/rocm-build.txt | 11 +++++------ requirements/test.in | 6 +++--- requirements/test.txt | 10 +++++----- tests/compile/test_aot_compile.py | 10 +++++----- .../compile/test_dynamic_shapes_compilation.py | 2 +- tests/standalone_tests/python_only_compile.sh | 3 ++- vllm/compilation/decorators.py | 8 ++++---- vllm/envs.py | 2 +- vllm/model_executor/layers/batch_invariant.py | 2 +- 16 files changed, 48 insertions(+), 47 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e034f75a9d322..422153e28f47e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,7 +38,7 @@ repos: rev: 0.9.1 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --extra-index-url, https://download.pytorch.org/whl/test/cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"] files: ^requirements/test\.(in|txt)$ - repo: local hooks: diff --git a/CMakeLists.txt b/CMakeLists.txt index c46fb18d7bfef..b7a77a8c450bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,8 +56,8 @@ endif() # requirements.txt files and 
should be kept consistent. The ROCm torch # versions are derived from docker/Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.9.1") -set(TORCH_SUPPORTED_VERSION_ROCM "2.9.1") +set(TORCH_SUPPORTED_VERSION_CUDA "2.10.0") +set(TORCH_SUPPORTED_VERSION_ROCM "2.10.0") # # Try to find python package with an executable that exactly matches @@ -432,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC}) endif() - if (MARLIN_SM75_ARCHS) + if (MARLIN_SM75_ARCHS) file(GLOB MARLIN_TEMPLATE_SM75_KERNEL_SRC "csrc/quantization/gptq_marlin/sm75_kernel_*.cu") set_gencode_flags_for_srcs( SRCS "${MARLIN_TEMPLATE_SM75_KERNEL_SRC}" @@ -444,7 +444,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_SM75_KERNEL_SRC}) endif() - if (MARLIN_FP8_ARCHS) + if (MARLIN_FP8_ARCHS) file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu") set_gencode_flags_for_srcs( SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}" @@ -1042,7 +1042,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC}) endif() - if (MARLIN_MOE_SM75_ARCHS) + if (MARLIN_MOE_SM75_ARCHS) file(GLOB MARLIN_MOE_SM75_SRC "csrc/moe/marlin_moe_wna16/sm75_kernel_*.cu") set_gencode_flags_for_srcs( SRCS "${MARLIN_MOE_SM75_SRC}" diff --git a/docker/Dockerfile b/docker/Dockerfile index 679ffc4a7df5f..acc530a1c88a6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -55,7 +55,7 @@ ARG UV_INDEX_URL=${PIP_INDEX_URL} ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} # PyTorch provides its own indexes for standard and nightly builds -ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl +ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl/test # PIP supports multiple authentication schemes, including keyring # By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to @@ -135,7 +135,7 @@ WORKDIR /workspace COPY requirements/common.txt requirements/common.txt COPY 
requirements/cuda.txt requirements/cuda.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \ + uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/cuda.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # CUDA arch list used by torch @@ -303,7 +303,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_LINK_MODE=copy RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --python /opt/venv/bin/python3 -r requirements/build.txt \ + uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/build.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') WORKDIR /workspace @@ -367,7 +367,7 @@ COPY requirements/lint.txt requirements/lint.txt COPY requirements/test.txt requirements/test.txt COPY requirements/dev.txt requirements/dev.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --python /opt/venv/bin/python3 -r requirements/dev.txt \ + uv pip install --python /opt/venv/bin/python3 --prerelease=allow -r requirements/dev.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') #################### DEV IMAGE #################### #################### vLLM installation IMAGE #################### @@ -465,7 +465,7 @@ ARG PYTORCH_CUDA_INDEX_BASE_URL COPY requirements/common.txt /tmp/common.txt COPY requirements/cuda.txt /tmp/requirements-cuda.txt RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r /tmp/requirements-cuda.txt \ + uv pip install --system --prerelease=allow -r /tmp/requirements-cuda.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') && \ rm /tmp/requirements-cuda.txt /tmp/common.txt @@ -522,9 +522,10 @@ ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system dist/*.whl --verbose \ + uv pip install --prerelease=allow --system dist/*.whl --verbose \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') + RUN --mount=type=cache,target=/root/.cache/uv \ . /etc/environment && \ uv pip list @@ -544,7 +545,7 @@ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH # Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system ep_kernels/dist/*.whl --verbose \ + uv pip install --prerelease=allow --system ep_kernels/dist/*.whl --verbose \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will @@ -587,7 +588,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN --mount=type=cache,target=/root/.cache/uv \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ if [ "$CUDA_MAJOR" -ge 12 ]; then \ - uv pip install --system -r requirements/dev.txt \ + uv pip install --prerelease=allow --system -r requirements/dev.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.'); \ fi diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 2caf1ad144178..f835803ad6aa4 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -26,7 +26,7 @@ FROM ubuntu:22.04 AS base-common WORKDIR /workspace/ ARG PYTHON_VERSION=3.12 -ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/test/cpu" # Install minimal dependencies and uv RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ diff --git a/pyproject.toml b/pyproject.toml index c03f96dd7acd5..3204c382ed7b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging>=24.2", "setuptools>=77.0.3,<81.0.0", "setuptools-scm>=8.0", - "torch == 2.9.1", + "torch == 2.10.0", "wheel", "jinja2", ] diff --git a/requirements/build.txt b/requirements/build.txt index 3756371638bad..44534adb861db 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -4,7 +4,7 @@ ninja packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 -torch==2.9.1 +torch==2.10.0 wheel jinja2>=3.1.6 regex diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 1417fb99120bc..c00a99084a485 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding # Dependencies for NVIDIA GPUs ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. -torch==2.9.1 -torchaudio==2.9.1 +torch==2.10.0 +torchaudio==2.10.0 # These must be updated alongside torch -torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +torchvision==0.25.0 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # FlashInfer should be updated together with the Dockerfile flashinfer-python==0.5.3 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 54af9d995c4a2..01a71c2da38c8 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -1,12 +1,11 @@ # Common dependencies -r common.txt ---extra-index-url https://download.pytorch.org/whl/rocm6.4 -torch==2.9.1 -torchvision==0.24.1 -torchaudio==2.9.1 - -triton==3.5.1 +--extra-index-url https://download.pytorch.org/whl/test/rocm7.0 +torch==2.10.0 +torchvision==0.25.0 +torchaudio==2.10.0 +triton==3.6.0 cmake>=3.26.1,<4 packaging>=24.2 setuptools>=77.0.3,<80.0.0 diff --git a/requirements/test.in b/requirements/test.in index b3fd733fb1bc0..4b9ecaed1a65d 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -24,9 +24,9 @@ soundfile # required for audio tests jiwer # required for audio tests tblib # for pickling test exceptions timm >=1.0.17 # required for internvl and gemma3n-mm test -torch==2.9.1 -torchaudio==2.9.1 -torchvision==0.24.1 +torch==2.10.0 +torchaudio==2.10.0 +torchvision==0.25.0 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[image,audio] >= 1.8.5 # required for voxtral test diff --git a/requirements/test.txt b/requirements/test.txt index 4012c2d3b212b..e1930e136df71 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -608,7 +608,7 @@ nvidia-nvjitlink-cu12==12.9.86 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 # torch -nvidia-nvshmem-cu12==3.3.20 +nvidia-nvshmem-cu12==3.4.5 # via torch nvidia-nvtx-cu12==12.9.79 # via torch @@ -1123,7 +1123,7 @@ tomli==2.2.1 # via schemathesis tomli-w==1.2.0 # via schemathesis -torch==2.9.1+cu129 +torch==2.10.0+cu129 # via # -r requirements/test.in # accelerate @@ -1152,7 +1152,7 @@ torch==2.9.1+cu129 # torchvision # vector-quantize-pytorch # vocos 
-torchaudio==2.9.1+cu129 +torchaudio==2.10.0+cu129 # via # -r requirements/test.in # encodec @@ -1165,7 +1165,7 @@ torchmetrics==1.7.4 # pytorch-lightning # terratorch # torchgeo -torchvision==0.24.1+cu129 +torchvision==0.25.0+cu129 # via # -r requirements/test.in # lightly @@ -1206,7 +1206,7 @@ transformers==4.57.3 # transformers-stream-generator transformers-stream-generator==0.0.5 # via -r requirements/test.in -triton==3.5.1 +triton==3.6.0 # via torch tritonclient==2.51.0 # via diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py index 2ffcd627e476a..d462fa02499d3 100644 --- a/tests/compile/test_aot_compile.py +++ b/tests/compile/test_aot_compile.py @@ -56,7 +56,7 @@ def use_vllm_config(vllm_config: VllmConfig): @pytest.mark.skipif( - not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10" + not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10" ) def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: @@ -80,7 +80,7 @@ def test_no_dynamo_cache_entry(monkeypatch: pytest.MonkeyPatch): @pytest.mark.skipif( - not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10" + not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10" ) def test_force_aot_load(monkeypatch: pytest.MonkeyPatch): with tempfile.TemporaryDirectory() as tmpdirname, monkeypatch.context() as m: @@ -94,7 +94,7 @@ def test_force_aot_load(monkeypatch: pytest.MonkeyPatch): @pytest.mark.skipif( - not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10" + not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10" ) def test_save_and_load(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: @@ -116,7 +116,7 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch): @pytest.mark.skipif( - not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10" + not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10" ) def 
test_shape_env(monkeypatch: pytest.MonkeyPatch): """ @@ -149,7 +149,7 @@ def test_shape_env(monkeypatch: pytest.MonkeyPatch): @pytest.mark.skipif( - not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10" + not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10" ) @create_new_process_for_each_test("spawn") def test_gpt2_cache_hit(monkeypatch: pytest.MonkeyPatch): diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py index 1fda21dea6361..0d9931ec33b2a 100644 --- a/tests/compile/test_dynamic_shapes_compilation.py +++ b/tests/compile/test_dynamic_shapes_compilation.py @@ -40,7 +40,7 @@ def get_test_models(): @pytest.mark.parametrize("use_bytecode_hook", [True, False]) @pytest.mark.parametrize("evaluate_guards", [False, True]) @pytest.mark.skipif( - not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10" + not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10" ) def test_dynamic_shapes_compilation( monkeypatch, diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh index ebf199a5056fb..a2fd3e358f2e2 100644 --- a/tests/standalone_tests/python_only_compile.sh +++ b/tests/standalone_tests/python_only_compile.sh @@ -67,7 +67,8 @@ apt autoremove -y echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py -VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . + +VLLM_PRECOMPILED_WHEEL_COMMIT=$merge_base_commit VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . 
--extra-index-url https://download.pytorch.org/whl/test/cu129/ # Run the script python3 -c 'import vllm' diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 40bde97ac61d8..89c625de9cebd 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -316,7 +316,7 @@ def _support_torch_compile( def _mark_dynamic_inputs(mod, type, *args, **kwargs): def mark_dynamic(arg, dims): if type == DynamicShapesType.UNBACKED: - if is_torch_equal_or_newer("2.10.0.dev"): + if is_torch_equal_or_newer("2.10.0"): for dim in dims: torch._dynamo.decorators.mark_unbacked( arg, dim, hint_override=arg.size()[dim] @@ -356,7 +356,7 @@ def _support_torch_compile( if isinstance(arg, torch.Tensor): # In case dims is specified with negative indexing dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] - if is_torch_equal_or_newer("2.10.0.dev"): + if is_torch_equal_or_newer("2.10.0"): for dim in dims: torch._dynamo.decorators.mark_unbacked( arg, dim, hint_override=arg.size()[dim] @@ -496,9 +496,9 @@ def _support_torch_compile( fx_config_patches["backed_size_oblivious"] = True # Prepare inductor config patches - # assume_32bit_indexing is only available in torch 2.10.0.dev+ + # assume_32bit_indexing is only available in torch 2.10.0+ inductor_config_patches = {} - if is_torch_equal_or_newer("2.10.0.dev"): + if is_torch_equal_or_newer("2.10.0"): inductor_config_patches["assume_32bit_indexing"] = True with ( diff --git a/vllm/envs.py b/vllm/envs.py index 1d4128d74b95c..25e855d768aa9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -286,7 +286,7 @@ def use_aot_compile() -> bool: default_value = ( "1" - if is_torch_equal_or_newer("2.10.0.dev") and not disable_compile_cache() + if is_torch_equal_or_newer("2.10.0") and not disable_compile_cache() else "0" ) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 1058270889b29..dbd7540646d02 100644 --- 
a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -974,7 +974,7 @@ def enable_batch_invariant_mode(): ) reduced_precision_val = ( - (False, False) if is_torch_equal_or_newer("2.10.0.dev") else False + (False, False) if is_torch_equal_or_newer("2.10.0") else False ) torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = ( reduced_precision_val From 6ca18fcc57d2e01904e81cc51811b635e09ea272 Mon Sep 17 00:00:00 2001 From: atalman Date: Tue, 16 Dec 2025 15:37:36 -0800 Subject: [PATCH 2/3] release_210_testing --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index db0d51afc7be1..d3e6d3b9b7bdd 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -13,7 +13,7 @@ vLLM uses a “right-shifted” versioning scheme where a new patch release is o ## Release Cadence -Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release. +Patch release is released on bi-weekly basis. Post release 1-2 days after patch release and uses same branch as patch release. Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional. | Release Date | Patch release versions | Post Release versions | From 902d7df99aec81621dd90923d57b5a189a732a8d Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 23 Dec 2025 13:10:02 -0800 Subject: [PATCH 3/3] Fix eagle dp tests on A100 `TP_SIZE=1 DP_SIZE=2 pytest -v -s tests/v1/distributed/test_eagle_dp.py` fails on A100 for me before this PR. Here's what I think is happening: - the test is checking that the tokens produced by a model with eagle are identical to those produced by a model without eagle - the model with eagle uses a draft model to produce draft tokens - the target model takes all of the draft tokens and then does a forward pass to see how many of the tokens to accept/reject. The target model is using a batch_size > 1.
- the model without eagle just generates the tokens one-by-one, that is, it has batch_size = 1. - For these two models to be *consistent*, we need batch invariance. So I turned on batch invariance (which also required the selection of an attention backend) Signed-off-by: Richard Zou --- tests/v1/distributed/test_eagle_dp.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/v1/distributed/test_eagle_dp.py b/tests/v1/distributed/test_eagle_dp.py index 9f6a6614fc1fd..f529fce0ab068 100644 --- a/tests/v1/distributed/test_eagle_dp.py +++ b/tests/v1/distributed/test_eagle_dp.py @@ -16,7 +16,12 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2)) @pytest.mark.asyncio -async def test_run_eagle_dp(): +async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch): + # This test checks that running a model with and without eagle + # leads to identical tokens. This is only true in batch invariant mode + # (because the target model verifies all draft tokens in one big forward pass) + monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1") + target_model = "meta-llama/Llama-3.1-8B-Instruct" draft_model = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" @@ -29,6 +34,7 @@ async def test_run_eagle_dp(): data_parallel_backend="mp", # ray takes more time trust_remote_code=True, max_model_len=16384, + attention_config={"backend": "FLASH_ATTN"}, ) eagle_engine_args = replace(