[CI] Prune tests/models/decoder_only/language/* tests (#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>

commit 02462465ea
parent b9c64c0ca7

@@ -321,7 +321,6 @@ steps:
   - tests/models/decoder_only/language
   commands:
     - pytest -v -s models/decoder_only/language/test_models.py
-    - pytest -v -s models/decoder_only/language/test_big_models.py
 
 - label: Decoder-only Language Models Test (Extended) # 1h20min
   nightly: true
@@ -329,7 +328,7 @@ steps:
   - vllm/
   - tests/models/decoder_only/language
   commands:
-    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+    - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
 
 - label: Decoder-only Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
@@ -1,93 +0,0 @@
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This tests bigger models and use half precision.
-
-Run `pytest tests/models/test_big_models.py`.
-"""
-import pytest
-
-from vllm.platforms import current_platform
-
-from ...utils import check_logprobs_close, check_outputs_equal
-
-MODELS = [
-    "meta-llama/Llama-2-7b-hf",
-    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
-    # "Deci/DeciLM-7b",  # Broken
-    # "tiiuae/falcon-7b",  # Broken
-    "EleutherAI/gpt-j-6b",
-    # "mosaicml/mpt-7b",  # Broken
-    # "Qwen/Qwen1.5-0.5B"  # Broken,
-]
-
-if not current_platform.is_cpu():
-    MODELS += [
-        # fused_moe which not supported on CPU
-        "openbmb/MiniCPM3-4B",
-        # Head size isn't supported on CPU
-        "h2oai/h2o-danube3-4b-base",
-    ]
-
-# TODO: remove this after CPU float16 support ready
-target_dtype = "float" if current_platform.is_cpu() else "half"
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [32])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-
-    if model == "openbmb/MiniCPM3-4B":
-        # the output becomes slightly different when upgrading to
-        # pytorch 2.5 . Changing to logprobs checks instead of exact
-        # output checks.
-        NUM_LOG_PROBS = 8
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
@@ -21,11 +21,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
     "kv_cache_dtype,base_model,test_model,scale_path",
     [
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
         # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct", None),
         # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
         ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
          "meta-llama/Llama-2-7b-chat-hf",
@@ -33,7 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
@@ -22,24 +22,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
 MAX_MODEL_LEN = 1024
 
 MODELS = [
-    # act_order==False, group_size=channelwise
-    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
-    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
-
     # act_order==True, group_size=128
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-    # act_order==True, group_size=64
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
-    # act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
 
     # 8-bit, act_order==True, group_size=channelwise
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-    # 8-bit, act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
-    # 8-bit, act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
 
     # 4-bit, act_order==True, group_size=128
     ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
@@ -25,16 +25,16 @@ model_pairs = [
     # 4-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
-    # 4-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    # # 4-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
 
     # 8-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
-    # 8-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+    # # 8-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
 ]
 
 
@@ -1,69 +0,0 @@
-"""Compare the outputs of a GPTQ model to a Marlin model.
-
-Note: GPTQ and Marlin do not have bitwise correctness.
-As a result, in this test, we just confirm that the top selected tokens of the
-Marlin/GPTQ models are in the top 3 selections of each other.
-
-Note: Marlin internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for Marlin. As a result, we re-run the test
-up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_marlin.py`.
-"""
-from dataclasses import dataclass
-
-import pytest
-
-from tests.quantization.utils import is_quant_method_supported
-
-from ...utils import check_logprobs_close
-
-
-@dataclass
-class ModelPair:
-    model_marlin: str
-    model_gptq: str
-
-
-model_pairs = [
-    ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128",
-              model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"),
-    ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin",
-              model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"),
-    ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
-              model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq")
-]
-
-
-@pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("marlin"),
-                    reason="Marlin is not supported on this GPU type.")
-@pytest.mark.parametrize("model_pair", model_pairs)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
-    vllm_runner,
-    example_prompts,
-    model_pair: ModelPair,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    with vllm_runner(model_pair.model_marlin,
-                     dtype=dtype,
-                     quantization="marlin") as marlin_model:
-        marlin_outputs = marlin_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
-        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
-
-    check_logprobs_close(
-        outputs_0_lst=gptq_outputs,
-        outputs_1_lst=marlin_outputs,
-        name_0="gptq",
-        name_1="marlin",
-    )
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
 """
 import pytest
 
-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
 
 from ...utils import check_logprobs_close
 
@@ -15,6 +15,10 @@ MODELS = [
     # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
+MISTRAL_FORMAT_MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.3",
+]
+
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 SYMBOLIC_LANG_PROMPTS = [
     "勇敢な船乗りについての詩を書く",  # japanese
@@ -95,7 +99,7 @@ def test_models(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -135,28 +139,29 @@ def test_mistral_format(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
 def test_mistral_symbolic_languages(
+    vllm_runner,
     model: str,
     dtype: str,
-    prompt: str,
 ) -> None:
-    prompt = "hi"
-    msg = {"role": "user", "content": prompt}
-    llm = LLM(model=model,
-              dtype=dtype,
-              max_model_len=8192,
-              tokenizer_mode="mistral",
-              config_format="mistral",
-              load_format="mistral")
-    outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
-    assert "�" not in outputs[0].outputs[0].text.strip()
+    with vllm_runner(model,
+                     dtype=dtype,
+                     max_model_len=8192,
+                     tokenizer_mode="mistral",
+                     config_format="mistral",
+                     load_format="mistral") as vllm_model:
+        for prompt in SYMBOLIC_LANG_PROMPTS:
+            msg = {"role": "user", "content": prompt}
+            outputs = vllm_model.model.chat([msg],
+                                            sampling_params=SAMPLING_PARAMS)
+            assert "�" not in outputs[0].outputs[0].text.strip()
 
 
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
+@pytest.mark.parametrize("model",
+                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
 def test_mistral_function_calling(
     vllm_runner,
     model: str,
@@ -7,25 +7,39 @@ Run `pytest tests/models/test_models.py`.
 """
 import pytest
 
-from ...utils import check_outputs_equal
+from vllm.platforms import current_platform
 
+from ...utils import check_logprobs_close
+
 MODELS = [
-    "facebook/opt-125m",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",  # Testing alibi slopes.
-    "microsoft/phi-2",
-    "stabilityai/stablelm-3b-4e1t",
-    # "allenai/OLMo-1B",  # Broken
-    "bigcode/starcoder2-3b",
-    "google/gemma-1.1-2b-it",
+    "facebook/opt-125m",  # opt
+    "openai-community/gpt2",  # gpt2
+    # "Milos/slovak-gpt-j-405M",  # gptj
+    # "bigcode/tiny_starcoder_py",  # gpt_bigcode
+    # "EleutherAI/pythia-70m",  # gpt_neox
+    "bigscience/bloom-560m",  # bloom - testing alibi slopes
+    "microsoft/phi-2",  # phi
+    # "stabilityai/stablelm-3b-4e1t",  # stablelm
+    # "bigcode/starcoder2-3b",  # starcoder2
+    "google/gemma-1.1-2b-it",  # gemma
+    "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
+    "meta-llama/Llama-3.2-1B-Instruct",  # llama
 ]
 
+if not current_platform.is_cpu():
+    MODELS += [
+        # fused_moe which not supported on CPU
+        "openbmb/MiniCPM3-4B",
+    ]
+
+# TODO: remove this after CPU float16 support ready
+target_dtype = "float" if current_platform.is_cpu() else "half"
+
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [96])
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(
     hf_runner,
     vllm_runner,
@@ -33,33 +47,24 @@ def test_models(
     model: str,
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
 ) -> None:
-    # To pass the small model tests, we need full precision.
-    assert dtype == "float"
-
     with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)
 
-    check_outputs_equal(
+    check_logprobs_close(
         outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
         name_0="hf",
         name_1="vllm",
     )
 
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
@@ -1,34 +0,0 @@
-"""Ensure that a text-only Qwen model can be run without throwing an error.
-We explicitly test this because Qwen is implemented as a multimodal and
-supports a visual encoder for models like Qwen-VL.
-"""
-from typing import List, Type
-
-import pytest
-
-from ....conftest import VllmRunner
-
-models = [
-    "Qwen/Qwen-7B-Chat"  # Has no visual encoder
-]
-
-
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_text_only_qwen_model_can_be_loaded_and_run(
-    vllm_runner: Type[VllmRunner],
-    example_prompts: List[str],
-    model: str,
-    *,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-):
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_model.generate_greedy_logprobs(
-            example_prompts,
-            max_tokens,
-            num_logprobs=num_logprobs,
-        )