From 43721bc67f9f30dcd1c893af370727b835d3d6dd Mon Sep 17 00:00:00 2001
From: Tahsin Tunan
Date: Thu, 16 Oct 2025 20:51:27 +0600
Subject: [PATCH] [CI] Replace large models with tiny alternatives in tests
 (#24057)

Signed-off-by: Tahsin Tunan
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../test_basic_correctness.py                 |  16 +--
 tests/basic_correctness/test_cpu_offload.py   |   2 +-
 tests/basic_correctness/test_cumem.py         |   4 +-
 tests/distributed/test_sequence_parallel.py   |   4 +-
 tests/entrypoints/llm/test_collective_rpc.py  |   5 +-
 tests/entrypoints/openai/test_run_batch.py    |   2 +-
 .../entrypoints/openai/test_serving_models.py |   2 +-
 tests/entrypoints/openai/test_shutdown.py     | 102 ++++++++++++++----
 tests/models/registry.py                      |   1 +
 tests/samplers/test_no_bad_words.py           |  10 +-
 tests/v1/core/test_scheduler_e2e.py           |   2 +-
 tests/v1/engine/test_engine_core.py           |   6 +-
 .../openai/test_multi_api_servers.py          |   2 +-
 tests/v1/sample/test_sampling_params_e2e.py   |   7 +-
 tests/v1/shutdown/test_delete.py              |   2 +-
 tests/v1/shutdown/test_forward_error.py       |   2 +-
 tests/v1/shutdown/test_startup_error.py       |   8 +-
 17 files changed, 118 insertions(+), 59 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 9b9d8cfea7fad..7f0e29d14f161 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -20,7 +20,7 @@
 from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 
 MODELS = [
-    "google/gemma-2-2b-it",
+    "hmellor/tiny-random-Gemma2ForCausalLM",
     "meta-llama/Llama-3.2-1B-Instruct",
 ]
@@ -29,7 +29,7 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
 
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("distilbert/distilgpt2")
+    llm = LLM("hmellor/tiny-random-LlamaForCausalLM")
     weak_llm = weakref.ref(llm)
     del llm
     # If there's any circular reference to vllm, this fails
@@ -125,14 +125,14 @@ def test_models(
 @pytest.mark.parametrize(
     "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
     [
-        ("distilbert/distilgpt2", "ray", "", "L4", {}),
-        ("distilbert/distilgpt2", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
-        ("distilbert/distilgpt2", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("facebook/opt-125m", "ray", "", "L4", {}),
+        ("facebook/opt-125m", "mp", "", "L4", {}),
+        ("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "A100", {}),
-        ("distilbert/distilgpt2", "mp", "", "A100", {}),
+        ("facebook/opt-125m", "ray", "", "A100", {}),
+        ("facebook/opt-125m", "mp", "", "A100", {}),
     ],
 )
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
index 3c1e01d072b9e..89839372c309a 100644
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -6,5 +6,5 @@ from ..utils import compare_two_settings
 
 def test_cpu_offload():
     compare_two_settings(
-        "meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]
+        "hmellor/tiny-random-LlamaForCausalLM", [], ["--cpu-offload-gb", "1"]
     )
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index f1b0f7b2de891..4c5971da1ce98 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -120,7 +120,7 @@ def test_cumem_with_cudagraph():
     "model",
     [
         # sleep mode with safetensors
-        "meta-llama/Llama-3.2-1B",
+        "hmellor/tiny-random-LlamaForCausalLM",
         # sleep mode with pytorch checkpoint
         "facebook/opt-125m",
     ],
@@ -174,7 +174,7 @@ def test_end_to_end(model: str):
 
 @create_new_process_for_each_test()
 def test_deep_sleep():
-    model = "Qwen/Qwen3-0.6B"
+    model = "hmellor/tiny-random-LlamaForCausalLM"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     llm = LLM(model, enable_sleep_mode=True)
diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py
index deefdf22ba06b..fc55a39bd573b 100644
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -273,14 +273,14 @@ def _compare_sp(
 
 SP_TEXT_GENERATION_MODELS = {
     # [Decoder-only]
-    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
+    "hmellor/tiny-random-LlamaForCausalLM": SPTestSettings.fast(),
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
 }
 
 SP_TEST_MODELS = [
     # TODO support other models
     # [LANGUAGE GENERATION]
-    "meta-llama/Llama-3.2-1B-Instruct",
+    "hmellor/tiny-random-LlamaForCausalLM",
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
 ]
diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
index 937aa5c132461..747676ac95675 100644
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+import torch
 
 from vllm import LLM
 
@@ -12,6 +13,8 @@ from ...utils import create_new_process_for_each_test
 @pytest.mark.parametrize("backend", ["mp", "ray"])
 @create_new_process_for_each_test()
 def test_collective_rpc(tp_size, backend, monkeypatch):
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
     if tp_size == 1 and backend == "ray":
         pytest.skip("Skip duplicate test case")
     if tp_size == 1:
@@ -24,7 +27,7 @@ def test_collective_rpc(tp_size, backend, monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
     llm = LLM(
-        model="meta-llama/Llama-3.2-1B-Instruct",
+        model="hmellor/tiny-random-LlamaForCausalLM",
         enforce_eager=True,
         load_format="dummy",
         tensor_parallel_size=tp_size,
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 5a97739e5a347..2f678a0535cc6 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -9,7 +9,7 @@ import pytest
 
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
 
-MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 
 # ruff: noqa: E501
 INPUT_BATCH = (
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index df5bf07a8bd41..3c022870dba4b 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -16,7 +16,7 @@ from vllm.entrypoints.openai.protocol import (
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
 LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully."
 LORA_UNLOADING_SUCCESS_MESSAGE = (
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index ff46df81d0fff..f3394b2dbac34 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -1,37 +1,93 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import signal
+import subprocess
+import sys
+import time
+
 import openai
 import pytest
 
-from ...utils import RemoteOpenAIServer
+from ...utils import get_open_port
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 
 
 @pytest.mark.asyncio
 async def test_shutdown_on_engine_failure():
-    # dtype, max-len etc set so that this can run in CI
-    args = [
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        "--max-num-seqs",
-        "128",
-    ]
+    """Verify that API returns connection error when server process is killed.
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        async with remote_server.get_async_client() as client:
-            with pytest.raises((openai.APIConnectionError, openai.InternalServerError)):
-                # Asking for lots of prompt logprobs will currently crash the
-                # engine. This may change in the future when that bug is fixed
-                prompt = "Hello " * 4000
-                await client.completions.create(
-                    model=MODEL_NAME, prompt=prompt, extra_body={"prompt_logprobs": 10}
+    Starts a vLLM server, kills it to simulate a crash, then verifies that
+    subsequent API calls fail appropriately.
+    """
+
+    port = get_open_port()
+
+    proc = subprocess.Popen(
+        [
+            # dtype, max-len etc set so that this can run in CI
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.api_server",
+            "--model",
+            MODEL_NAME,
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "128",
+            "--enforce-eager",
+            "--port",
+            str(port),
+            "--gpu-memory-utilization",
+            "0.05",
+            "--max-num-seqs",
+            "2",
+            "--disable-frontend-multiprocessing",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
+    )
+
+    # Wait for server startup
+    start_time = time.time()
+    client = openai.AsyncOpenAI(
+        base_url=f"http://localhost:{port}/v1",
+        api_key="dummy",
+        max_retries=0,
+        timeout=10,
+    )
+
+    # Poll until server is ready
+    while time.time() - start_time < 30:
+        try:
+            await client.completions.create(
+                model=MODEL_NAME, prompt="Hello", max_tokens=1
+            )
+            break
+        except Exception:
+            time.sleep(0.5)
+            if proc.poll() is not None:
+                stdout, stderr = proc.communicate(timeout=1)
+                pytest.fail(
+                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
                 )
+    else:
+        proc.terminate()
+        proc.wait(timeout=5)
+        pytest.fail("Server failed to start in 30 seconds")
 
-    # Now the server should shut down
-    return_code = remote_server.proc.wait(timeout=8)
-    assert return_code is not None
+    # Kill server to simulate crash
+    proc.terminate()
+    time.sleep(1)
+
+    # Verify API calls now fail
+    with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
+        await client.completions.create(
+            model=MODEL_NAME, prompt="This should fail", max_tokens=1
+        )
+
+    return_code = proc.wait(timeout=5)
+    assert return_code is not None
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 617dc30691aa8..1d3d7fe6595a8 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -330,6 +330,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
             "guard": "meta-llama/Llama-Guard-3-1B",
             "hermes": "NousResearch/Hermes-3-Llama-3.1-8B",
             "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
+            "tiny": "hmellor/tiny-random-LlamaForCausalLM",
         },
     ),
     "LLaMAForCausalLM": _HfExamplesInfo(
diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py
index 1b76b909629c9..74047d2f03558 100644
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -35,15 +35,13 @@ def _generate(
 
 
 class TestOneTokenBadWord:
-    MODEL = "TheBloke/Llama-2-7B-fp16"
+    MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 
-    PROMPT = "Hi! How are"
-    TARGET_TOKEN = "you"
+    PROMPT = "How old are "
+    TARGET_TOKEN = "mn"
 
     def setup_method(self, method):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.MODEL, add_prefix_space=True
-        )
+        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
 
         self.num_prompt_tokens = len(self._encode(self.PROMPT))
         self.target_token_id = self._encode(
diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py
index 90f8757ae4939..f1df4e95d5f49 100644
--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
@@ -5,7 +5,7 @@ import pytest
 
 from vllm import LLM
 
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"
 
 
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 997b2b74bb6b5..2eb391d676009 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -24,9 +24,11 @@ from ...utils import create_new_process_for_each_test, multi_gpu_test
 if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
-PROMPT = "Hello my name is Robert and I love quantization kernels"
+# test_engine_core_concurrent_batches assumes exactly 12 tokens per prompt.
+# Adjust prompt if changing model to maintain 12-token length.
+PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py
index 55328f0cf0f09..db52aef70f607 100644
--- a/tests/v1/entrypoints/openai/test_multi_api_servers.py
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -10,7 +10,7 @@ import pytest_asyncio
 from tests.utils import RemoteOpenAIServer
 from tests.v1.utils import check_request_balancing
 
-MODEL_NAME = "ibm-research/PowerMoE-3b"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 
 DP_SIZE = os.getenv("DP_SIZE", "1")
 
diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py
index bdde28fe0342a..915b9957031d8 100644
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -5,16 +5,13 @@ import pytest
 
 from vllm import LLM, SamplingParams
 
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"
 
 
 @pytest.fixture(scope="module")
 def llm() -> LLM:
-    # Disable prefix caching so that we can test prompt logprobs.
-    # TODO remove this after https://github.com/vllm-project/vllm/pull/13949
-    # is merged
-    return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
+    return LLM(MODEL, enforce_eager=True)
 
 
 def test_n_gt_1(llm):
diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py
index d943578278641..286575b0d50c7 100644
--- a/tests/v1/shutdown/test_delete.py
+++ b/tests/v1/shutdown/test_delete.py
@@ -15,7 +15,7 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 
 
 @pytest.mark.asyncio
diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py
index 383348e88540a..cacc71be43a78 100644
--- a/tests/v1/shutdown/test_forward_error.py
+++ b/tests/v1/shutdown/test_forward_error.py
@@ -18,7 +18,7 @@ from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineDeadError
 
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 
 
 def evil_forward(self, *args, **kwargs):
diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py
index 019c0c4d7cf07..66c9a52b6de2f 100644
--- a/tests/v1/shutdown/test_startup_error.py
+++ b/tests/v1/shutdown/test_startup_error.py
@@ -16,7 +16,7 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 
 
 def evil_method(self, *args, **kwargs):
@@ -76,8 +76,10 @@ def test_llm_startup_error(
     Test profiling (forward()) and load weights failures.
     TODO(andy) - LLM without multiprocessing.
     """
-    if model != "meta-llama/Llama-3.2-1B":
-        pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
+    # Skip non-Llama models since we monkeypatch LlamaForCausalLM specifically.
+    # If the MODELS list grows, each architecture needs its own test variant.
+    if model != "hmellor/tiny-random-LlamaForCausalLM":
+        pytest.skip(reason="Only test hmellor/tiny-random-LlamaForCausalLM")
     if cuda_device_count_stateless() < tensor_parallel_size:
         pytest.skip(reason="Not enough CUDA devices")
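
Reviewer note (not part of the patch): the snippet below is a minimal sketch for sanity-checking the replacement model locally. The model name and the enforce_eager / load_format="dummy" arguments are taken from the diff above (see tests/entrypoints/llm/test_collective_rpc.py); the prompt, token budget, and output handling are illustrative assumptions, not anything the patch requires.

    # Quick local smoke test for the tiny replacement model (illustrative sketch).
    from vllm import LLM, SamplingParams

    # load_format="dummy" skips downloading real weights, mirroring the
    # collective RPC test; the config and tokenizer are still fetched from the Hub.
    llm = LLM(
        model="hmellor/tiny-random-LlamaForCausalLM",
        enforce_eager=True,
        load_format="dummy",
    )
    outputs = llm.generate(
        ["Hello my name is Robert and I"], SamplingParams(max_tokens=8)
    )
    # Output text is meaningless (random weights); the point is that load + generate work.
    print(outputs[0].outputs[0].text)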