From 902d7df99aec81621dd90923d57b5a189a732a8d Mon Sep 17 00:00:00 2001
From: Richard Zou <zou3519@gmail.com>
Date: Tue, 23 Dec 2025 13:10:02 -0800
Subject: [PATCH] Fix eagle dp tests on A100

`TP_SIZE=1 DP_SIZE=2 pytest -v -s tests/v1/distributed/test_eagle_dp.py` fails
on A100 for me before this PR.

Here's what I think is happening:
- the test is checking that the tokens produced by a model with eagle are
  identical to those produced by a model without eagle
- the model with eagle uses a draft model to produce draft tokens
- the target model takes all of the draft tokens and then does a forward
  pass to see how many of the tokens to accept/reject. The target model
  is using a batch_size > 1.
- the model without eagle just generates the tokens one-by-one, that is,
  it has batch_size = 1.
- For these two models to be *consistent*, we need batch invariance. So
  I turned on batch invariance (which also required explicitly selecting
  an attention backend).

Signed-off-by: Richard Zou <zou3519@gmail.com>
---
 tests/v1/distributed/test_eagle_dp.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/v1/distributed/test_eagle_dp.py b/tests/v1/distributed/test_eagle_dp.py
index 9f6a6614fc1fd..f529fce0ab068 100644
--- a/tests/v1/distributed/test_eagle_dp.py
+++ b/tests/v1/distributed/test_eagle_dp.py
@@ -16,7 +16,12 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2))
 
 
 @pytest.mark.asyncio
-async def test_run_eagle_dp():
+async def test_run_eagle_dp(monkeypatch: pytest.MonkeyPatch):
+    # This test checks that running a model with and without eagle
+    # leads to identical tokens. This is only true in batch invariant mode
+    # (because the target model verifies all draft tokens in one big forward pass)
+    monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1")
+
     target_model = "meta-llama/Llama-3.1-8B-Instruct"
     draft_model = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
 
@@ -29,6 +34,7 @@ async def test_run_eagle_dp():
         data_parallel_backend="mp",  # ray takes more time
         trust_remote_code=True,
         max_model_len=16384,
+        attention_config={"backend": "FLASH_ATTN"},
     )
 
     eagle_engine_args = replace(