From 88fcf00ddaa99e9eb1da58c4d46dc5bf59bbf42d Mon Sep 17 00:00:00 2001 From: Huy Do Date: Tue, 29 Apr 2025 19:41:02 -0700 Subject: [PATCH] Fix some speculative decode tests with tl.dot (#17371) Signed-off-by: Huy Do --- tests/spec_decode/e2e/test_multistep_correctness.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index bb45be791fa8a..e187b6bc14347 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( @pytest.mark.parametrize( "common_llm_kwargs", [{ - "block_size": 8, + "block_size": 16, # 2 for small prompt, 256//8 for generated. "num_gpu_blocks_override": 2 + 256 // 8, "max_model_len": (2 + 256 // 8) * 8, @@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( @pytest.mark.parametrize( "per_test_common_llm_kwargs", [ - # As of this writing, vLLM only compiles with these 3 block sizes by - # default. - { - "block_size": 8, - }, + # https://github.com/triton-lang/triton/issues/2266 tl.dot + # doesn't support embedding < 16 { "block_size": 16, },