Disable enforce_eager for V1 TPU sampler and structured output tests (#17016)

Signed-off-by: mgoin <mgoin64@gmail.com>
2026-01-29 18:27:14 +08:00 · 2025-04-24 03:50:09 -06:00 · 2025-04-24 03:50:09 -06:00 · 14288d1332
commit 14288d1332
parent b411418ff0
3 changed files with 7 additions and 2 deletions
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@@ -19,6 +19,7 @@ docker run --privileged --net host --shm-size=16G -it \
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
    && python3 -m pip install pytest pytest-asyncio tpu-info \
    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && export VLLM_XLA_CACHE_PATH= \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
    && echo HARDWARE \
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -13,6 +13,7 @@ from pydantic import BaseModel

 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
+from vllm.platforms import current_platform
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams

 PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
@@ -63,10 +64,13 @@ def test_structured_output(
 ):
    monkeypatch.setenv("VLLM_USE_V1", "1")

+    # Don't use eager execution on TPUs because we want to test for no
+    # recompilation at runtime
+    enforce_eager = bool(not current_platform.is_tpu())
    # Use a single LLM instance for several scenarios to
    # speed up the test suite.
    llm = LLM(model=model_name,
-              enforce_eager=True,
+              enforce_eager=enforce_eager,
              max_model_len=1024,
              guided_decoding_backend=guided_decoding_backend,
              tokenizer_mode=tokenizer_mode)
--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
@@ -23,7 +23,7 @@ def test_sampler_different(model_name: str):
    different results.
    """
    llm = LLM(model_name,
-              enforce_eager=True,
+              enforce_eager=False,
              max_num_seqs=1,
              max_model_len=512,
              max_num_batched_tokens=512)