From 7b1895e6ce4942091e16da790af8c12772a1d384 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Sun, 29 Jun 2025 11:31:37 +0900
Subject: [PATCH] [CI Fix] Try fixing eagle e2e test OOM by reducing block allocation (#20213)

Signed-off-by: mgoin
---
 tests/spec_decode/e2e/test_eagle_correctness.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py
index fd838285aba7c..7c369feec4152 100644
--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -370,6 +370,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
+        # 2 for small prompt, 256//16 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 16,
+        "max_model_len": (2 + 256 // 16) * 16,
+
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
 
@@ -420,6 +424,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
+        # 2 for small prompt, 256//16 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 16,
+        "max_model_len": (2 + 256 // 16) * 16,
+
         # Skip cuda graph recording for fast test.
         "enforce_eager": True,
 