Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 10:30:37 +08:00)
[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>

parent b9c64c0ca7
commit 02462465ea
@@ -321,7 +321,6 @@ steps:
   - tests/models/decoder_only/language
   commands:
     - pytest -v -s models/decoder_only/language/test_models.py
-    - pytest -v -s models/decoder_only/language/test_big_models.py
 
 - label: Decoder-only Language Models Test (Extended) # 1h20min
   nightly: true
@@ -329,7 +328,7 @@ steps:
   - vllm/
   - tests/models/decoder_only/language
   commands:
-    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
 
 - label: Decoder-only Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
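For local runs, the standard/extended split above can be reproduced with a small driver. The helper below is hypothetical and not part of this commit; it simply re-issues the two pytest invocations from the hunk and assumes it is executed from the tests/ directory of a vLLM checkout.

# run_decoder_only_language.py -- hypothetical local helper, not part of this commit.
# "standard" runs only the pruned core file (the 18min CI job); "extended" runs
# everything else in the directory (the nightly job).
import sys

import pytest

SUITE_ARGS = {
    "standard": [
        "-v", "-s", "models/decoder_only/language/test_models.py",
    ],
    "extended": [
        "-v", "-s", "models/decoder_only/language",
        "--ignore=models/decoder_only/language/test_models.py",
    ],
}

if __name__ == "__main__":
    suite = sys.argv[1] if len(sys.argv) > 1 else "standard"
    raise SystemExit(pytest.main(SUITE_ARGS[suite]))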
@@ -1,93 +0,0 @@
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This tests bigger models and use half precision.
-
-Run `pytest tests/models/test_big_models.py`.
-"""
-import pytest
-
-from vllm.platforms import current_platform
-
-from ...utils import check_logprobs_close, check_outputs_equal
-
-MODELS = [
-    "meta-llama/Llama-2-7b-hf",
-    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
-    # "Deci/DeciLM-7b",  # Broken
-    # "tiiuae/falcon-7b",  # Broken
-    "EleutherAI/gpt-j-6b",
-    # "mosaicml/mpt-7b",  # Broken
-    # "Qwen/Qwen1.5-0.5B"  # Broken,
-]
-
-if not current_platform.is_cpu():
-    MODELS += [
-        # fused_moe which not supported on CPU
-        "openbmb/MiniCPM3-4B",
-        # Head size isn't supported on CPU
-        "h2oai/h2o-danube3-4b-base",
-    ]
-
-# TODO: remove this after CPU float16 support ready
-target_dtype = "float" if current_platform.is_cpu() else "half"
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [32])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-
-    if model == "openbmb/MiniCPM3-4B":
-        # the output becomes slightly different when upgrading to
-        # pytorch 2.5 . Changing to logprobs checks instead of exact
-        # output checks.
-        NUM_LOG_PROBS = 8
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
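The deleted test_big_models.py above compared HF and vLLM greedy outputs for exact equality via check_outputs_equal for most models, falling back to a logprob check only for MiniCPM3-4B. As a rough illustration of what the exact-equality path asserts, here is a minimal sketch; the helper name and the (token_ids, text) tuple layout are assumptions made for illustration, not the project's actual implementation.

from typing import List, Tuple

# Assumed layout for one prompt's result: (generated token ids, decoded text).
TokensText = Tuple[List[int], str]


def check_outputs_equal_sketch(outputs_0_lst: List[TokensText],
                               outputs_1_lst: List[TokensText],
                               name_0: str, name_1: str) -> None:
    # Exact greedy-decoding match: every prompt must yield identical token ids.
    assert len(outputs_0_lst) == len(outputs_1_lst)
    for i, (out_0, out_1) in enumerate(zip(outputs_0_lst, outputs_1_lst)):
        ids_0, text_0 = out_0
        ids_1, text_1 = out_1
        assert ids_0 == ids_1, (
            f"prompt {i}: {name_0} produced {text_0!r}, "
            f"{name_1} produced {text_1!r}")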
@@ -21,11 +21,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
     "kv_cache_dtype,base_model,test_model,scale_path",
     [
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
         # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct", None),
         # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
         ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
          "meta-llama/Llama-2-7b-chat-hf",
@@ -33,7 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
@@ -22,24 +22,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024
 
 MODELS = [
-    # act_order==False, group_size=channelwise
-    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
-    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
-
     # act_order==True, group_size=128
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-    # act_order==True, group_size=64
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
-    # act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
 
     # 8-bit, act_order==True, group_size=channelwise
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-    # 8-bit, act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
-    # 8-bit, act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
 
     # 4-bit, act_order==True, group_size=128
     ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
@@ -25,16 +25,16 @@ model_pairs = [
     # 4-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
-    # 4-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    # # 4-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
 
     # 8-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
-    # 8-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+    # # 8-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
 ]
 
 
@@ -1,69 +0,0 @@
-"""Compare the outputs of a GPTQ model to a Marlin model.
-
-Note: GPTQ and Marlin do not have bitwise correctness.
-As a result, in this test, we just confirm that the top selected tokens of the
-Marlin/GPTQ models are in the top 3 selections of each other.
-
-Note: Marlin internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for Marlin. As a result, we re-run the test
-up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_marlin.py`.
-"""
-from dataclasses import dataclass
-
-import pytest
-
-from tests.quantization.utils import is_quant_method_supported
-
-from ...utils import check_logprobs_close
-
-
-@dataclass
-class ModelPair:
-    model_marlin: str
-    model_gptq: str
-
-
-model_pairs = [
-    ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128",
-              model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"),
-    ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin",
-              model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"),
-    ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
-              model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq")
-]
-
-
-@pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("marlin"),
-                    reason="Marlin is not supported on this GPU type.")
-@pytest.mark.parametrize("model_pair", model_pairs)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
-    vllm_runner,
-    example_prompts,
-    model_pair: ModelPair,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    with vllm_runner(model_pair.model_marlin,
-                     dtype=dtype,
-                     quantization="marlin") as marlin_model:
-        marlin_outputs = marlin_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
-        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    check_logprobs_close(
-        outputs_0_lst=gptq_outputs,
-        outputs_1_lst=marlin_outputs,
-        name_0="gptq",
-        name_1="marlin",
-    )
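The docstring of the deleted test_marlin.py states the acceptance criterion: GPTQ and Marlin are not bitwise identical, so each model's greedily chosen token only needs to appear among the other model's top candidates. Below is a minimal sketch of that criterion; in the repository the real check is check_logprobs_close, and the per-step (token_id, top_logprobs) layout used here is an assumption for illustration.

from typing import Dict, List, Tuple

# One generation step: the chosen token id and that step's top-k candidate
# token ids mapped to their logprobs.
Step = Tuple[int, Dict[int, float]]


def tokens_mutually_in_top_k(steps_a: List[Step], steps_b: List[Step]) -> bool:
    """Accept if, at every step, each side's chosen token appears in the other
    side's top-k candidates (the 'top selections of each other' criterion)."""
    for (tok_a, top_a), (tok_b, top_b) in zip(steps_a, steps_b):
        if tok_a == tok_b:
            continue  # identical greedy choices trivially agree
        if tok_a not in top_b or tok_b not in top_a:
            return False
    return True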
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
 """
 import pytest
 
-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
 
 from ...utils import check_logprobs_close
 
@@ -15,6 +15,10 @@ MODELS = [
     # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
+MISTRAL_FORMAT_MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.3",
+]
+
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 SYMBOLIC_LANG_PROMPTS = [
     "勇敢な船乗りについての詩を書く",  # japanese
@@ -95,7 +99,7 @@ def test_models(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -135,28 +139,29 @@ def test_mistral_format(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
 def test_mistral_symbolic_languages(
+    vllm_runner,
     model: str,
     dtype: str,
-    prompt: str,
 ) -> None:
-    prompt = "hi"
-    msg = {"role": "user", "content": prompt}
-    llm = LLM(model=model,
-              dtype=dtype,
-              max_model_len=8192,
-              tokenizer_mode="mistral",
-              config_format="mistral",
-              load_format="mistral")
-    outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
-    assert "�" not in outputs[0].outputs[0].text.strip()
+    with vllm_runner(model,
+                     dtype=dtype,
+                     max_model_len=8192,
+                     tokenizer_mode="mistral",
+                     config_format="mistral",
+                     load_format="mistral") as vllm_model:
+        for prompt in SYMBOLIC_LANG_PROMPTS:
+            msg = {"role": "user", "content": prompt}
+            outputs = vllm_model.model.chat([msg],
+                                            sampling_params=SAMPLING_PARAMS)
+            assert "�" not in outputs[0].outputs[0].text.strip()
 
 
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
+@pytest.mark.parametrize("model",
+                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
 def test_mistral_function_calling(
     vllm_runner,
     model: str,
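The test_mistral.py hunks above replace direct LLM(...) construction with the vllm_runner fixture used as a context manager, so each parametrized case tears its engine down before the next one starts. The wrapper below is a minimal sketch of that pattern, not vLLM's actual fixture; FakeEngine and the cleanup details are assumptions made so the sketch runs on its own.

import contextlib
import gc


class FakeEngine:
    """Stand-in for an LLM engine; only here to make the sketch runnable."""

    def chat(self, messages, sampling_params=None):
        return ["ok"]


@contextlib.contextmanager
def runner_sketch(model: str, **engine_kwargs):
    # Build the engine once per test case...
    engine = FakeEngine()
    try:
        yield engine
    finally:
        # ...and release it before the next parametrized case starts, which is
        # the point of moving from a bare constructor to a `with` block.
        del engine
        gc.collect()


# Usage, mirroring the new shape of test_mistral_symbolic_languages:
with runner_sketch("mistralai/Mistral-7B-Instruct-v0.3",
                   tokenizer_mode="mistral") as model:
    outputs = model.chat([{"role": "user", "content": "hi"}])
    assert outputs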
@@ -7,25 +7,39 @@ Run `pytest tests/models/test_models.py`.
 """
 import pytest
 
-from ...utils import check_outputs_equal
+from vllm.platforms import current_platform
+
+from ...utils import check_logprobs_close
 
 MODELS = [
-    "facebook/opt-125m",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",  # Testing alibi slopes.
-    "microsoft/phi-2",
-    "stabilityai/stablelm-3b-4e1t",
-    # "allenai/OLMo-1B",  # Broken
-    "bigcode/starcoder2-3b",
-    "google/gemma-1.1-2b-it",
+    "facebook/opt-125m",  # opt
+    "openai-community/gpt2",  # gpt2
+    # "Milos/slovak-gpt-j-405M",  # gptj
+    # "bigcode/tiny_starcoder_py",  # gpt_bigcode
+    # "EleutherAI/pythia-70m",  # gpt_neox
+    "bigscience/bloom-560m",  # bloom - testing alibi slopes
+    "microsoft/phi-2",  # phi
+    # "stabilityai/stablelm-3b-4e1t",  # stablelm
+    # "bigcode/starcoder2-3b",  # starcoder2
+    "google/gemma-1.1-2b-it",  # gemma
+    "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
+    "meta-llama/Llama-3.2-1B-Instruct",  # llama
 ]
 
+if not current_platform.is_cpu():
+    MODELS += [
+        # fused_moe which not supported on CPU
+        "openbmb/MiniCPM3-4B",
+    ]
+
+# TODO: remove this after CPU float16 support ready
+target_dtype = "float" if current_platform.is_cpu() else "half"
+
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
     hf_runner,
     vllm_runner,
@@ -33,33 +47,24 @@ def test_models(
     model: str,
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
-    assert dtype == "float"
 
     with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)
 
-    check_outputs_equal(
+    check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
        name_0="hf",
         name_1="vllm",
     )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
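The separate test_model_print is folded into test_models above: printing the driver worker's torch module exercises every layer's extra_repr. That behavior comes from torch.nn.Module itself, as the small self-contained example below shows; the TinyAttention module is made up for illustration.

import torch.nn as nn


class TinyAttention(nn.Module):
    """Made-up module: extra_repr() feeds the summary emitted by print()."""

    def __init__(self, num_heads: int = 8, head_dim: int = 64) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.qkv = nn.Linear(num_heads * head_dim, 3 * num_heads * head_dim)

    def extra_repr(self) -> str:
        return f"num_heads={self.num_heads}, head_dim={self.head_dim}"


# print() walks the module tree and includes each extra_repr() line; this is
# the code path the consolidated test exercises for every model.
print(TinyAttention())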
@@ -1,34 +0,0 @@
-"""Ensure that a text-only Qwen model can be run without throwing an error.
-We explicitly test this because Qwen is implemented as a multimodal and
-supports a visual encoder for models like Qwen-VL.
-"""
-from typing import List, Type
-
-import pytest
-
-from ....conftest import VllmRunner
-
-models = [
-    "Qwen/Qwen-7B-Chat"  # Has no visual encoder
-]
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_text_only_qwen_model_can_be_loaded_and_run(
-    vllm_runner: Type[VllmRunner],
-    example_prompts: List[str],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-):
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_model.generate_greedy_logprobs(
-            example_prompts,
-            max_tokens,
-            num_logprobs=num_logprobs,
-        )
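The deleted test_qwen.py only guarded against a crash when loading and running a text-only Qwen checkpoint, since the implementation is shared with the multimodal Qwen-VL path. The sketch below is a rough stand-alone equivalent using vLLM's public API rather than the test fixtures; the trust_remote_code flag and the exact generate call shape are assumptions, not taken from this diff.

from vllm import LLM, SamplingParams


def qwen_text_only_smoke_check(model: str = "Qwen/Qwen-7B-Chat") -> None:
    # Loading is the interesting part: a text-only checkpoint must not trip
    # over the visual-encoder code path of the shared Qwen implementation.
    llm = LLM(model=model, dtype="bfloat16", trust_remote_code=True)
    params = SamplingParams(temperature=0.0, max_tokens=32, logprobs=5)
    outputs = llm.generate(["Hello, my name is"], params)
    assert outputs and outputs[0].outputs[0].text


if __name__ == "__main__":
    qwen_text_only_smoke_check()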