[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
Michael Goin, 2024-11-05 16:02:23 -05:00, committed by GitHub
parent b9c64c0ca7
commit 02462465ea
9 changed files with 70 additions and 270 deletions

View File

@@ -321,7 +321,6 @@ steps:
   - tests/models/decoder_only/language
   commands:
     - pytest -v -s models/decoder_only/language/test_models.py
-    - pytest -v -s models/decoder_only/language/test_big_models.py

 - label: Decoder-only Language Models Test (Extended) # 1h20min
   nightly: true
@@ -329,7 +328,7 @@ steps:
   - vllm/
   - tests/models/decoder_only/language
   commands:
-    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

 - label: Decoder-only Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]

View File

@@ -1,93 +0,0 @@
"""Compare the outputs of HF and vLLM when using greedy sampling.

This tests bigger models and use half precision.

Run `pytest tests/models/test_big_models.py`.
"""
import pytest

from vllm.platforms import current_platform

from ...utils import check_logprobs_close, check_outputs_equal

MODELS = [
    "meta-llama/Llama-2-7b-hf",
    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
    # "Deci/DeciLM-7b",  # Broken
    # "tiiuae/falcon-7b",  # Broken
    "EleutherAI/gpt-j-6b",
    # "mosaicml/mpt-7b",  # Broken
    # "Qwen/Qwen1.5-0.5B"  # Broken,
]

if not current_platform.is_cpu():
    MODELS += [
        # fused_moe which not supported on CPU
        "openbmb/MiniCPM3-4B",
        # Head size isn't supported on CPU
        "h2oai/h2o-danube3-4b-base",
    ]

# TODO: remove this after CPU float16 support ready
target_dtype = "float" if current_platform.is_cpu() else "half"


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    if model == "openbmb/MiniCPM3-4B":
        # the output becomes slightly different when upgrading to
        # pytorch 2.5 . Changing to logprobs checks instead of exact
        # output checks.
        NUM_LOG_PROBS = 8
        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy_logprobs_limit(
                example_prompts, max_tokens, NUM_LOG_PROBS)

        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy_logprobs(
                example_prompts, max_tokens, NUM_LOG_PROBS)

        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
    else:
        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                      max_tokens)

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
        print(vllm_model.model.llm_engine.model_executor.driver_worker.
              model_runner.model)
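
For context: the deleted test used check_outputs_equal for exact greedy-output matching and fell back to check_logprobs_close where two implementations may legitimately pick different tokens. A minimal standalone sketch of that looser check follows; the helper name logprobs_close and the assumption that each output is a (token_ids, text, per-step logprobs) tuple are illustrative, not the actual tests/models/utils.py implementation.

# Hedged sketch of a logprob-closeness check, not the real vLLM helper.
# Assumes each output is (token_ids, text, logprobs_per_step), where
# logprobs_per_step[i] maps candidate token ids to log-probabilities.
from typing import Dict, List, Sequence, Tuple

Output = Tuple[List[int], str, List[Dict[int, float]]]


def logprobs_close(outputs_0: Sequence[Output],
                   outputs_1: Sequence[Output],
                   name_0: str = "impl_0",
                   name_1: str = "impl_1") -> None:
    for prompt_idx, (out_0, out_1) in enumerate(zip(outputs_0, outputs_1)):
        ids_0, _, logprobs_0 = out_0
        ids_1, _, logprobs_1 = out_1
        for step, (tok_0, tok_1) in enumerate(zip(ids_0, ids_1)):
            if tok_0 == tok_1:
                continue  # same greedy choice; nothing to check
            # Tolerate divergence only if each choice is still a top
            # candidate of the other implementation at this step.
            assert tok_0 in logprobs_1[step] and tok_1 in logprobs_0[step], (
                f"prompt {prompt_idx}, step {step}: "
                f"{name_0} chose {tok_0}, {name_1} chose {tok_1}")
            break  # after the first divergence, later steps are not comparable


if __name__ == "__main__":
    a = [([1, 5], "ab", [{1: -0.1, 2: -1.0}, {5: -0.2, 6: -1.2}])]
    b = [([1, 6], "ac", [{1: -0.1, 5: -1.1}, {6: -0.3, 5: -1.0}])]
    logprobs_close(a, b, "hf", "vllm")  # passes: 5 and 6 are top candidates of each other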

View File

@@ -21,11 +21,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
     "kv_cache_dtype,base_model,test_model,scale_path",
     [
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
         # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct", None),
         # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
         ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
          "meta-llama/Llama-2-7b-chat-hf",
@@ -33,7 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
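
The trimmed fp8 test above exercises the same path a user reaches through the kv_cache_dtype engine argument. A rough usage sketch, with the Llama-3.2-1B pairing taken from the parametrization above and the prompt and sampling values invented for illustration:

# Rough sketch (not part of the commit): exercising the fp8_e5m2 kv-cache
# path covered by the parametrization above. Prompt/sampling values are illustrative.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    kv_cache_dtype="fp8_e5m2",  # FP16 checkpoint, fp8 KV cache (no scale file)
    enforce_eager=True,         # mirrors the remaining CI configuration
)
params = SamplingParams(temperature=0.0, max_tokens=4, logprobs=4)
outputs = llm.generate(["The capital of France is"], params)
print(outputs[0].outputs[0].text)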

View File

@@ -22,24 +22,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024

 MODELS = [
-    # act_order==False, group_size=channelwise
-    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
-    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
     # act_order==True, group_size=128
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-    # act_order==True, group_size=64
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
-    # act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
     # 8-bit, act_order==True, group_size=channelwise
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-    # 8-bit, act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
-    # 8-bit, act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
     # 4-bit, act_order==True, group_size=128
     ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")

View File

@@ -25,16 +25,16 @@ model_pairs = [
     # 4-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
-    # 4-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    # # 4-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
     # 8-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
-    # 8-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+    # # 8-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
 ]

View File

@@ -1,69 +0,0 @@
"""Compare the outputs of a GPTQ model to a Marlin model.

Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.

Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.

Run `pytest tests/models/test_marlin.py`.
"""
from dataclasses import dataclass

import pytest

from tests.quantization.utils import is_quant_method_supported

from ...utils import check_logprobs_close


@dataclass
class ModelPair:
    model_marlin: str
    model_gptq: str


model_pairs = [
    ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128",
              model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"),
    ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin",
              model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"),
    ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
              model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq")
]


@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("marlin"),
                    reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model_pair: ModelPair,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    with vllm_runner(model_pair.model_marlin,
                     dtype=dtype,
                     quantization="marlin") as marlin_model:
        marlin_outputs = marlin_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model_pair.model_gptq, dtype=dtype,
                     quantization="gptq") as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=marlin_outputs,
        name_0="gptq",
        name_1="marlin",
    )

View File

@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
 """
 import pytest

-from vllm import LLM, SamplingParams
+from vllm import SamplingParams

 from ...utils import check_logprobs_close
@@ -15,6 +15,10 @@ MODELS = [
     # "mistralai/Mistral-Nemo-Instruct-2407"
 ]

+MISTRAL_FORMAT_MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.3",
+]
+
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)

 SYMBOLIC_LANG_PROMPTS = [
     "勇敢な船乗りについての詩を書く",  # japanese
@@ -95,7 +99,7 @@ def test_models(
     )


-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -135,28 +139,29 @@ def test_mistral_format(
     )


-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
 def test_mistral_symbolic_languages(
+    vllm_runner,
     model: str,
     dtype: str,
-    prompt: str,
 ) -> None:
-    prompt = "hi"
-    msg = {"role": "user", "content": prompt}
-    llm = LLM(model=model,
-              dtype=dtype,
-              max_model_len=8192,
-              tokenizer_mode="mistral",
-              config_format="mistral",
-              load_format="mistral")
-    outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
-    assert "�" not in outputs[0].outputs[0].text.strip()
+    with vllm_runner(model,
+                     dtype=dtype,
+                     max_model_len=8192,
+                     tokenizer_mode="mistral",
+                     config_format="mistral",
+                     load_format="mistral") as vllm_model:
+        for prompt in SYMBOLIC_LANG_PROMPTS:
+            msg = {"role": "user", "content": prompt}
+            outputs = vllm_model.model.chat([msg],
+                                            sampling_params=SAMPLING_PARAMS)
+            assert "�" not in outputs[0].outputs[0].text.strip()


 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
+@pytest.mark.parametrize("model",
+                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
 def test_mistral_function_calling(
     vllm_runner,
     model: str,

View File

@@ -7,25 +7,39 @@ Run `pytest tests/models/test_models.py`.
 """
 import pytest

-from ...utils import check_outputs_equal
+from vllm.platforms import current_platform
+
+from ...utils import check_logprobs_close

 MODELS = [
-    "facebook/opt-125m",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",  # Testing alibi slopes.
-    "microsoft/phi-2",
-    "stabilityai/stablelm-3b-4e1t",
-    # "allenai/OLMo-1B",  # Broken
-    "bigcode/starcoder2-3b",
-    "google/gemma-1.1-2b-it",
+    "facebook/opt-125m",  # opt
+    "openai-community/gpt2",  # gpt2
+    # "Milos/slovak-gpt-j-405M",  # gptj
+    # "bigcode/tiny_starcoder_py",  # gpt_bigcode
+    # "EleutherAI/pythia-70m",  # gpt_neox
+    "bigscience/bloom-560m",  # bloom - testing alibi slopes
+    "microsoft/phi-2",  # phi
+    # "stabilityai/stablelm-3b-4e1t",  # stablelm
+    # "bigcode/starcoder2-3b",  # starcoder2
+    "google/gemma-1.1-2b-it",  # gemma
+    "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
+    "meta-llama/Llama-3.2-1B-Instruct",  # llama
 ]

+if not current_platform.is_cpu():
+    MODELS += [
+        # fused_moe which not supported on CPU
+        "openbmb/MiniCPM3-4B",
+    ]
+
+# TODO: remove this after CPU float16 support ready
+target_dtype = "float" if current_platform.is_cpu() else "half"
+

 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
     hf_runner,
     vllm_runner,
@@ -33,33 +47,24 @@ def test_models(
     model: str,
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
-    assert dtype == "float"
-
     with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)

     with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)

-    check_outputs_equal(
+    check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
         name_0="hf",
         name_1="vllm",
     )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)

View File

@@ -1,34 +0,0 @@
"""Ensure that a text-only Qwen model can be run without throwing an error.
We explicitly test this because Qwen is implemented as a multimodal and
supports a visual encoder for models like Qwen-VL.
"""
from typing import List, Type

import pytest

from ....conftest import VllmRunner

models = [
    "Qwen/Qwen-7B-Chat"  # Has no visual encoder
]


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_text_only_qwen_model_can_be_loaded_and_run(
    vllm_runner: Type[VllmRunner],
    example_prompts: List[str],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_model.generate_greedy_logprobs(
            example_prompts,
            max_tokens,
            num_logprobs=num_logprobs,
        )
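
The scenario this deleted file covered can still be reproduced by hand. A rough sketch follows, assuming trust_remote_code is needed for Qwen-7B-Chat's custom tokenizer/config (an assumption, not something the diff states); the prompt is illustrative.

# Rough sketch (not from the diff): load the text-only Qwen checkpoint the
# deleted test used and confirm it generates without error.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen-7B-Chat",
          dtype="bfloat16",
          trust_remote_code=True)  # assumed necessary for Qwen's custom code
outputs = llm.generate(["Hello, how are you?"],
                       SamplingParams(temperature=0.0, max_tokens=32, logprobs=5))
assert outputs and outputs[0].outputs[0].text  # loads and generates without error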