[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>

commit 02462465ea
parent b9c64c0ca7

@@ -321,7 +321,6 @@ steps:
   - tests/models/decoder_only/language
   commands:
     - pytest -v -s models/decoder_only/language/test_models.py
-    - pytest -v -s models/decoder_only/language/test_big_models.py
 
 - label: Decoder-only Language Models Test (Extended) # 1h20min
   nightly: true
@@ -329,7 +328,7 @@ steps:
   - vllm/
   - tests/models/decoder_only/language
   commands:
-    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
 
 - label: Decoder-only Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
@@ -1,93 +0,0 @@
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This tests bigger models and use half precision.
-
-Run `pytest tests/models/test_big_models.py`.
-"""
-import pytest
-
-from vllm.platforms import current_platform
-
-from ...utils import check_logprobs_close, check_outputs_equal
-
-MODELS = [
-    "meta-llama/Llama-2-7b-hf",
-    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
-    # "Deci/DeciLM-7b",  # Broken
-    # "tiiuae/falcon-7b",  # Broken
-    "EleutherAI/gpt-j-6b",
-    # "mosaicml/mpt-7b",  # Broken
-    # "Qwen/Qwen1.5-0.5B"  # Broken,
-]
-
-if not current_platform.is_cpu():
-    MODELS += [
-        # fused_moe which not supported on CPU
-        "openbmb/MiniCPM3-4B",
-        # Head size isn't supported on CPU
-        "h2oai/h2o-danube3-4b-base",
-    ]
-
-# TODO: remove this after CPU float16 support ready
-target_dtype = "float" if current_platform.is_cpu() else "half"
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [32])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-
-    if model == "openbmb/MiniCPM3-4B":
-        # the output becomes slightly different when upgrading to
-        # pytorch 2.5 . Changing to logprobs checks instead of exact
-        # output checks.
-        NUM_LOG_PROBS = 8
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
@@ -21,11 +21,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
     "kv_cache_dtype,base_model,test_model,scale_path",
     [
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
         # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct", None),
         # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
         ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
          "meta-llama/Llama-2-7b-chat-hf",
@@ -33,7 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
@@ -22,24 +22,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024
 
 MODELS = [
-    # act_order==False, group_size=channelwise
-    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
-    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
-
     # act_order==True, group_size=128
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-    # act_order==True, group_size=64
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
-    # act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
 
     # 8-bit, act_order==True, group_size=channelwise
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-    # 8-bit, act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
-    # 8-bit, act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
 
     # 4-bit, act_order==True, group_size=128
     ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
@@ -25,16 +25,16 @@ model_pairs = [
     # 4-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
-    # 4-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    # # 4-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
 
     # 8-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
-    # 8-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+    # # 8-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
 ]
 
 
@@ -1,69 +0,0 @@
-"""Compare the outputs of a GPTQ model to a Marlin model.
-
-Note: GPTQ and Marlin do not have bitwise correctness.
-As a result, in this test, we just confirm that the top selected tokens of the
-Marlin/GPTQ models are in the top 3 selections of each other.
-
-Note: Marlin internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for Marlin. As a result, we re-run the test
-up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_marlin.py`.
-"""
-from dataclasses import dataclass
-
-import pytest
-
-from tests.quantization.utils import is_quant_method_supported
-
-from ...utils import check_logprobs_close
-
-
-@dataclass
-class ModelPair:
-    model_marlin: str
-    model_gptq: str
-
-
-model_pairs = [
-    ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128",
-              model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"),
-    ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin",
-              model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"),
-    ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
-              model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq")
-]
-
-
-@pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("marlin"),
-                    reason="Marlin is not supported on this GPU type.")
-@pytest.mark.parametrize("model_pair", model_pairs)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
-    vllm_runner,
-    example_prompts,
-    model_pair: ModelPair,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    with vllm_runner(model_pair.model_marlin,
-                     dtype=dtype,
-                     quantization="marlin") as marlin_model:
-        marlin_outputs = marlin_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
-        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    check_logprobs_close(
-        outputs_0_lst=gptq_outputs,
-        outputs_1_lst=marlin_outputs,
-        name_0="gptq",
-        name_1="marlin",
-    )
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
 """
 import pytest
 
-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
 
 from ...utils import check_logprobs_close
 
@@ -15,6 +15,10 @@ MODELS = [
     # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
+MISTRAL_FORMAT_MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.3",
+]
+
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 SYMBOLIC_LANG_PROMPTS = [
     "勇敢な船乗りについての詩を書く",  # japanese
@@ -95,7 +99,7 @@ def test_models(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -135,28 +139,29 @@ def test_mistral_format(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
 def test_mistral_symbolic_languages(
+    vllm_runner,
     model: str,
     dtype: str,
-    prompt: str,
 ) -> None:
-    prompt = "hi"
-    msg = {"role": "user", "content": prompt}
-    llm = LLM(model=model,
-              dtype=dtype,
-              max_model_len=8192,
-              tokenizer_mode="mistral",
-              config_format="mistral",
-              load_format="mistral")
-    outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
-    assert "�" not in outputs[0].outputs[0].text.strip()
+    with vllm_runner(model,
+                     dtype=dtype,
+                     max_model_len=8192,
+                     tokenizer_mode="mistral",
+                     config_format="mistral",
+                     load_format="mistral") as vllm_model:
+        for prompt in SYMBOLIC_LANG_PROMPTS:
+            msg = {"role": "user", "content": prompt}
+            outputs = vllm_model.model.chat([msg],
+                                            sampling_params=SAMPLING_PARAMS)
+            assert "�" not in outputs[0].outputs[0].text.strip()
 
 
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
+@pytest.mark.parametrize("model",
+                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
 def test_mistral_function_calling(
     vllm_runner,
     model: str,
@@ -7,25 +7,39 @@ Run `pytest tests/models/test_models.py`.
 """
 import pytest
 
-from ...utils import check_outputs_equal
+from vllm.platforms import current_platform
 
+from ...utils import check_logprobs_close
+
 MODELS = [
-    "facebook/opt-125m",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",  # Testing alibi slopes.
-    "microsoft/phi-2",
-    "stabilityai/stablelm-3b-4e1t",
-    # "allenai/OLMo-1B",  # Broken
-    "bigcode/starcoder2-3b",
-    "google/gemma-1.1-2b-it",
+    "facebook/opt-125m",  # opt
+    "openai-community/gpt2",  # gpt2
+    # "Milos/slovak-gpt-j-405M",  # gptj
+    # "bigcode/tiny_starcoder_py",  # gpt_bigcode
+    # "EleutherAI/pythia-70m",  # gpt_neox
+    "bigscience/bloom-560m",  # bloom - testing alibi slopes
+    "microsoft/phi-2",  # phi
+    # "stabilityai/stablelm-3b-4e1t",  # stablelm
+    # "bigcode/starcoder2-3b",  # starcoder2
+    "google/gemma-1.1-2b-it",  # gemma
+    "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
+    "meta-llama/Llama-3.2-1B-Instruct",  # llama
 ]
 
+if not current_platform.is_cpu():
+    MODELS += [
+        # fused_moe which not supported on CPU
+        "openbmb/MiniCPM3-4B",
+    ]
+
+# TODO: remove this after CPU float16 support ready
+target_dtype = "float" if current_platform.is_cpu() else "half"
+
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
     hf_runner,
     vllm_runner,
@@ -33,33 +47,24 @@ def test_models(
     model: str,
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
-    assert dtype == "float"
-
     with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)
 
-    check_outputs_equal(
+    check_logprobs_close(
         outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
         name_0="hf",
         name_1="vllm",
     )
 
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
@@ -1,34 +0,0 @@
-"""Ensure that a text-only Qwen model can be run without throwing an error.
-We explicitly test this because Qwen is implemented as a multimodal and
-supports a visual encoder for models like Qwen-VL.
-"""
-from typing import List, Type
-
-import pytest
-
-from ....conftest import VllmRunner
-
-models = [
-    "Qwen/Qwen-7B-Chat"  # Has no visual encoder
-]
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_text_only_qwen_model_can_be_loaded_and_run(
-    vllm_runner: Type[VllmRunner],
-    example_prompts: List[str],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-):
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_model.generate_greedy_logprobs(
-            example_prompts,
-            max_tokens,
-            num_logprobs=num_logprobs,
-        )