From 43721bc67f9f30dcd1c893af370727b835d3d6dd Mon Sep 17 00:00:00 2001
From: Tahsin Tunan
Date: Thu, 16 Oct 2025 20:51:27 +0600
Subject: [PATCH] [CI] Replace large models with tiny alternatives in tests
 (#24057)

Signed-off-by: Tahsin Tunan
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../test_basic_correctness.py                 |  16 +--
 tests/basic_correctness/test_cpu_offload.py   |   2 +-
 tests/basic_correctness/test_cumem.py         |   4 +-
 tests/distributed/test_sequence_parallel.py   |   4 +-
 tests/entrypoints/llm/test_collective_rpc.py  |   5 +-
 tests/entrypoints/openai/test_run_batch.py    |   2 +-
 .../entrypoints/openai/test_serving_models.py |   2 +-
 tests/entrypoints/openai/test_shutdown.py     | 102 ++++++++++++++----
 tests/models/registry.py                      |   1 +
 tests/samplers/test_no_bad_words.py           |  10 +-
 tests/v1/core/test_scheduler_e2e.py           |   2 +-
 tests/v1/engine/test_engine_core.py           |   6 +-
 .../openai/test_multi_api_servers.py          |   2 +-
 tests/v1/sample/test_sampling_params_e2e.py   |   7 +-
 tests/v1/shutdown/test_delete.py              |   2 +-
 tests/v1/shutdown/test_forward_error.py       |   2 +-
 tests/v1/shutdown/test_startup_error.py       |   8 +-
 17 files changed, 118 insertions(+), 59 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 9b9d8cfea7fad..7f0e29d14f161 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -20,7 +20,7 @@
 from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 
 MODELS = [
-    "google/gemma-2-2b-it",
+    "hmellor/tiny-random-Gemma2ForCausalLM",
     "meta-llama/Llama-3.2-1B-Instruct",
 ]
@@ -29,7 +29,7 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
 
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("distilbert/distilgpt2")
+    llm = LLM("hmellor/tiny-random-LlamaForCausalLM")
     weak_llm = weakref.ref(llm)
     del llm
     # If there's any circular reference to vllm, this fails
@@ -125,14 +125,14 @@ def test_models(
 @pytest.mark.parametrize(
     "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
     [
-        ("distilbert/distilgpt2", "ray", "", "L4", {}),
-        ("distilbert/distilgpt2", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
-        ("distilbert/distilgpt2", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("facebook/opt-125m", "ray", "", "L4", {}),
+        ("facebook/opt-125m", "mp", "", "L4", {}),
+        ("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
         ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
         ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "A100", {}),
-        ("distilbert/distilgpt2", "mp", "", "A100", {}),
+        ("facebook/opt-125m", "ray", "", "A100", {}),
+        ("facebook/opt-125m", "mp", "", "A100", {}),
     ],
 )
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
index 3c1e01d072b9e..89839372c309a 100644
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -6,5 +6,5 @@ from ..utils import compare_two_settings
 
 def test_cpu_offload():
     compare_two_settings(
-        "meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]
+        "hmellor/tiny-random-LlamaForCausalLM", [], ["--cpu-offload-gb", "1"]
     )
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index f1b0f7b2de891..4c5971da1ce98 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -120,7 +120,7 @@ def test_cumem_with_cudagraph():
     "model",
     [
         # sleep mode with safetensors
-        "meta-llama/Llama-3.2-1B",
+        "hmellor/tiny-random-LlamaForCausalLM",
         # sleep mode with pytorch checkpoint
         "facebook/opt-125m",
     ],
@@ -174,7 +174,7 @@ def test_end_to_end(model: str):
 
 @create_new_process_for_each_test()
 def test_deep_sleep():
-    model = "Qwen/Qwen3-0.6B"
+    model = "hmellor/tiny-random-LlamaForCausalLM"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
     llm = LLM(model, enable_sleep_mode=True)
diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py
index deefdf22ba06b..fc55a39bd573b 100644
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -273,14 +273,14 @@ def _compare_sp(
 
 SP_TEXT_GENERATION_MODELS = {
     # [Decoder-only]
-    "meta-llama/Llama-3.2-1B-Instruct": SPTestSettings.fast(),
+    "hmellor/tiny-random-LlamaForCausalLM": SPTestSettings.fast(),
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8": SPTestSettings.fp8_quant(),
 }
 
 SP_TEST_MODELS = [
     # TODO support other models
     # [LANGUAGE GENERATION]
-    "meta-llama/Llama-3.2-1B-Instruct",
+    "hmellor/tiny-random-LlamaForCausalLM",
     "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
 ]
diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py
index 937aa5c132461..747676ac95675 100644
--- a/tests/entrypoints/llm/test_collective_rpc.py
+++ b/tests/entrypoints/llm/test_collective_rpc.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
+import torch
 
 from vllm import LLM
 
@@ -12,6 +13,8 @@ from ...utils import create_new_process_for_each_test
 @pytest.mark.parametrize("backend", ["mp", "ray"])
 @create_new_process_for_each_test()
 def test_collective_rpc(tp_size, backend, monkeypatch):
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
     if tp_size == 1 and backend == "ray":
         pytest.skip("Skip duplicate test case")
     if tp_size == 1:
@@ -24,7 +27,7 @@ def test_collective_rpc(tp_size, backend, monkeypatch):
     monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
 
     llm = LLM(
-        model="meta-llama/Llama-3.2-1B-Instruct",
+        model="hmellor/tiny-random-LlamaForCausalLM",
         enforce_eager=True,
         load_format="dummy",
         tensor_parallel_size=tp_size,
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 5a97739e5a347..2f678a0535cc6 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -9,7 +9,7 @@ import pytest
 
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
 
-MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 
 # ruff: noqa: E501
 INPUT_BATCH = (
diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py
index df5bf07a8bd41..3c022870dba4b 100644
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -16,7 +16,7 @@ from vllm.entrypoints.openai.protocol import (
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.lora.request import LoRARequest
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
 LORA_LOADING_SUCCESS_MESSAGE = "Success: LoRA adapter '{lora_name}' added successfully."
 LORA_UNLOADING_SUCCESS_MESSAGE = (
diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py
index ff46df81d0fff..f3394b2dbac34 100644
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -1,37 +1,93 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import signal
+import subprocess
+import sys
+import time
+
 import openai
 import pytest
 
-from ...utils import RemoteOpenAIServer
+from ...utils import get_open_port
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 
 
 @pytest.mark.asyncio
 async def test_shutdown_on_engine_failure():
-    # dtype, max-len etc set so that this can run in CI
-    args = [
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        "--max-num-seqs",
-        "128",
-    ]
+    """Verify that API returns connection error when server process is killed.
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        async with remote_server.get_async_client() as client:
-            with pytest.raises((openai.APIConnectionError, openai.InternalServerError)):
-                # Asking for lots of prompt logprobs will currently crash the
-                # engine. This may change in the future when that bug is fixed
-                prompt = "Hello " * 4000
-                await client.completions.create(
-                    model=MODEL_NAME, prompt=prompt, extra_body={"prompt_logprobs": 10}
+    Starts a vLLM server, kills it to simulate a crash, then verifies that
+    subsequent API calls fail appropriately.
+    """
+
+    port = get_open_port()
+
+    proc = subprocess.Popen(
+        [
+            # dtype, max-len etc set so that this can run in CI
+            sys.executable,
+            "-m",
+            "vllm.entrypoints.openai.api_server",
+            "--model",
+            MODEL_NAME,
+            "--dtype",
+            "bfloat16",
+            "--max-model-len",
+            "128",
+            "--enforce-eager",
+            "--port",
+            str(port),
+            "--gpu-memory-utilization",
+            "0.05",
+            "--max-num-seqs",
+            "2",
+            "--disable-frontend-multiprocessing",
+        ],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
+    )
+
+    # Wait for server startup
+    start_time = time.time()
+    client = openai.AsyncOpenAI(
+        base_url=f"http://localhost:{port}/v1",
+        api_key="dummy",
+        max_retries=0,
+        timeout=10,
+    )
+
+    # Poll until server is ready
+    while time.time() - start_time < 30:
+        try:
+            await client.completions.create(
+                model=MODEL_NAME, prompt="Hello", max_tokens=1
+            )
+            break
+        except Exception:
+            time.sleep(0.5)
+            if proc.poll() is not None:
+                stdout, stderr = proc.communicate(timeout=1)
+                pytest.fail(
+                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
                 )
+    else:
+        proc.terminate()
+        proc.wait(timeout=5)
+        pytest.fail("Server failed to start in 30 seconds")
 
-    # Now the server should shut down
-    return_code = remote_server.proc.wait(timeout=8)
-    assert return_code is not None
+    # Kill server to simulate crash
+    proc.terminate()
+    time.sleep(1)
+
+    # Verify API calls now fail
+    with pytest.raises((openai.APIConnectionError, openai.APIStatusError)):
+        await client.completions.create(
+            model=MODEL_NAME, prompt="This should fail", max_tokens=1
+        )
+
+    return_code = proc.wait(timeout=5)
+    assert return_code is not None
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 617dc30691aa8..1d3d7fe6595a8 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -330,6 +330,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
             "guard": "meta-llama/Llama-Guard-3-1B",
             "hermes": "NousResearch/Hermes-3-Llama-3.1-8B",
             "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
+            "tiny": "hmellor/tiny-random-LlamaForCausalLM",
         },
     ),
     "LLaMAForCausalLM": _HfExamplesInfo(
diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py
index 1b76b909629c9..74047d2f03558 100644
--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -35,15 +35,13 @@ def _generate(
 
 
 class TestOneTokenBadWord:
-    MODEL = "TheBloke/Llama-2-7B-fp16"
+    MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 
-    PROMPT = "Hi! How are"
-    TARGET_TOKEN = "you"
+    PROMPT = "How old are "
+    TARGET_TOKEN = "mn"
 
     def setup_method(self, method):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.MODEL, add_prefix_space=True
-        )
+        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
 
         self.num_prompt_tokens = len(self._encode(self.PROMPT))
         self.target_token_id = self._encode(
diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py
index 90f8757ae4939..f1df4e95d5f49 100644
--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
@@ -5,7 +5,7 @@ import pytest
 
 from vllm import LLM
 
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"
 
 
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 997b2b74bb6b5..2eb391d676009 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -24,9 +24,11 @@ from ...utils import create_new_process_for_each_test, multi_gpu_test
 if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
 
-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
-PROMPT = "Hello my name is Robert and I love quantization kernels"
+# test_engine_core_concurrent_batches assumes exactly 12 tokens per prompt.
+# Adjust prompt if changing model to maintain 12-token length.
+PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 
diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py
index 55328f0cf0f09..db52aef70f607 100644
--- a/tests/v1/entrypoints/openai/test_multi_api_servers.py
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -10,7 +10,7 @@ import pytest_asyncio
 from tests.utils import RemoteOpenAIServer
 from tests.v1.utils import check_request_balancing
 
-MODEL_NAME = "ibm-research/PowerMoE-3b"
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 
 DP_SIZE = os.getenv("DP_SIZE", "1")
 
diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py
index bdde28fe0342a..915b9957031d8 100644
--- a/tests/v1/sample/test_sampling_params_e2e.py
+++ b/tests/v1/sample/test_sampling_params_e2e.py
@@ -5,16 +5,13 @@ import pytest
 
 from vllm import LLM, SamplingParams
 
-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = "hmellor/tiny-random-LlamaForCausalLM"
 PROMPT = "Hello my name is Robert and I"
 
 
 @pytest.fixture(scope="module")
 def llm() -> LLM:
-    # Disable prefix caching so that we can test prompt logprobs.
-    # TODO remove this after https://github.com/vllm-project/vllm/pull/13949
-    # is merged
-    return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False)
+    return LLM(MODEL, enforce_eager=True)
 
 
 def test_n_gt_1(llm):
diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py
index d943578278641..286575b0d50c7 100644
--- a/tests/v1/shutdown/test_delete.py
+++ b/tests/v1/shutdown/test_delete.py
@@ -15,7 +15,7 @@ from vllm.sampling_params import RequestOutputKind
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 
 
 @pytest.mark.asyncio
diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py
index 383348e88540a..cacc71be43a78 100644
--- a/tests/v1/shutdown/test_forward_error.py
+++ b/tests/v1/shutdown/test_forward_error.py
@@ -18,7 +18,7 @@ from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineDeadError
 
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 
 
 def evil_forward(self, *args, **kwargs):
diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py
index 019c0c4d7cf07..66c9a52b6de2f 100644
--- a/tests/v1/shutdown/test_startup_error.py
+++ b/tests/v1/shutdown/test_startup_error.py
@@ -16,7 +16,7 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = ["hmellor/tiny-random-LlamaForCausalLM"]
 
 
 def evil_method(self, *args, **kwargs):
@@ -76,8 +76,10 @@ def test_llm_startup_error(
     Test profiling (forward()) and load weights failures.
     TODO(andy) - LLM without multiprocessing.
     """
-    if model != "meta-llama/Llama-3.2-1B":
-        pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
+    # Skip non-Llama models since we monkeypatch LlamaForCausalLM specifically.
+    # If the MODELS list grows, each architecture needs its own test variant.
+    if model != "hmellor/tiny-random-LlamaForCausalLM":
+        pytest.skip(reason="Only test hmellor/tiny-random-LlamaForCausalLM")
     if cuda_device_count_stateless() < tensor_parallel_size:
         pytest.skip(reason="Not enough CUDA devices")
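
Reviewer note (not part of the patch): the snippet below is a minimal sketch for sanity-checking the replacement model locally. The model name and the enforce_eager / load_format="dummy" arguments are taken from the diff above (see tests/entrypoints/llm/test_collective_rpc.py); the prompt, token budget, and output handling are illustrative assumptions, not anything the patch requires.

    # Quick local smoke test for the tiny replacement model (illustrative sketch).
    from vllm import LLM, SamplingParams

    # load_format="dummy" skips downloading real weights, mirroring the
    # collective RPC test; the config and tokenizer are still fetched from the Hub.
    llm = LLM(
        model="hmellor/tiny-random-LlamaForCausalLM",
        enforce_eager=True,
        load_format="dummy",
    )
    outputs = llm.generate(
        ["Hello my name is Robert and I"], SamplingParams(max_tokens=8)
    )
    # Output text is meaningless (random weights); the point is that load + generate work.
    print(outputs[0].outputs[0].text)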