diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py
index a2d35486a5e8..7e6bd3664ebd 100644
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -69,6 +69,13 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
         more_args = None
         if current_platform.is_tpu():
             # Limit compilation time for TPU V1
+
+            # Skip this model on TPU while its failures are being investigated.
+            if model == "google/gemma-3-1b-it":
+                pytest.skip(
+                    "Temporarily disabled due to test failures "
+                    "(timeout or accuracy mismatch). Re-enable once fixed.")
+
             more_args = "max_model_len=2048,max_num_seqs=64"
 
             # Add TP test (if provided)