From bd98842c8a018d02d09f8c4249ea0547f505286d Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 10 Sep 2025 21:16:39 +0800 Subject: [PATCH] [CI] Add PPL test for generation models (#24485) Signed-off-by: wang.yuqi --- .buildkite/test-pipeline.yaml | 10 ++ .../language/generation_ppl_test/__init__.py | 0 .../language/generation_ppl_test/ppl_utils.py | 131 ++++++++++++++++++ .../generation_ppl_test/test_gemma.py | 18 +++ .../language/generation_ppl_test/test_gpt.py | 14 ++ .../language/generation_ppl_test/test_qwen.py | 21 +++ tests/models/language/pooling/embed_utils.py | 2 +- tests/models/language/pooling/mteb_utils.py | 11 +- tests/models/utils.py | 11 +- 9 files changed, 211 insertions(+), 7 deletions(-) create mode 100644 tests/models/language/generation_ppl_test/__init__.py create mode 100644 tests/models/language/generation_ppl_test/ppl_utils.py create mode 100644 tests/models/language/generation_ppl_test/test_gemma.py create mode 100644 tests/models/language/generation_ppl_test/test_gpt.py create mode 100644 tests/models/language/generation_ppl_test/test_qwen.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0479c86f7a974..75a9c4a22cb4d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -604,6 +604,16 @@ steps: - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + - label: Language Models Test (Extended Pooling) # 36min timeout_in_minutes: 50 mirror_hardwares: [amdexperimental] diff --git a/tests/models/language/generation_ppl_test/__init__.py b/tests/models/language/generation_ppl_test/__init__.py new file mode 100644 
index 0000000000000..e69de29bb2d1d diff --git a/tests/models/language/generation_ppl_test/ppl_utils.py b/tests/models/language/generation_ppl_test/ppl_utils.py new file mode 100644 index 0000000000000..550e874cf8579 --- /dev/null +++ b/tests/models/language/generation_ppl_test/ppl_utils.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://huggingface.co/docs/transformers/perplexity +from typing import Optional, cast + +import pytest +import torch +from datasets import load_dataset + +from tests.models.utils import (GenerateModelInfo, + TokensTextLogprobsPromptLogprobs) +from vllm.logprobs import Logprob + +# See #24485 +PPL_TOL = 0.01 +MAX_LENGTH = 1024 + + +@torch.inference_mode +def wikitext_ppl_test(hf_runner, + vllm_runner, + model_info: GenerateModelInfo, + max_length=MAX_LENGTH, + vllm_extra_kwargs=None, + atol=PPL_TOL): + + # A model family has many models with the same architecture, + # and we don't need to test each one. + if not model_info.enable_test: + pytest.skip("Skipping test.") + + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + + # Allow vllm to test using the given dtype, such as float32 + vllm_extra_kwargs = vllm_extra_kwargs or {} + vllm_extra_kwargs["dtype"] = model_info.dtype + + # Allow vllm to test using hf_overrides + if model_info.hf_overrides is not None: + vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides + + with vllm_runner(model_info.name, + gpu_memory_utilization=0.7, + max_model_len=max_length, + max_num_seqs=1, + enforce_eager=True, + **vllm_extra_kwargs) as vllm_model: + # Use max_num_seqs=1 to avoid OOM, + # and to avoid batching different requests together.
+ + model_config = vllm_model.llm.llm_engine.model_config + + # Confirm whether vllm is using the correct architecture + if model_info.architecture: + assert (model_info.architecture in model_config.architectures) + + max_length = min(model_config.max_model_len - 1, max_length) + stride = max_length + + tokenizer = vllm_model.llm.get_tokenizer() + tokens = tokenizer.encode("\n\n".join(dataset["text"])) + n_tokens = len(tokens) + + chunks = [] + for begin_loc in range(0, n_tokens, stride): + end_loc = min(begin_loc + max_length, n_tokens) + chunks.append(tokens[begin_loc:end_loc]) + + outputs = vllm_model.generate_greedy_logprobs(prompts=chunks, + max_tokens=1, + num_logprobs=None, + num_prompt_logprobs=0, + use_tqdm=False) + nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu") + n_tokens = 0 + for output in outputs: + output = cast(TokensTextLogprobsPromptLogprobs, output) + token_datas = cast(list[Optional[dict[int, Logprob]]], output[3]) + + assert token_datas[0] is None + token_log_probs = [] + for token_data in token_datas[1:]: + assert token_data is not None + assert len(token_data) == 1 + token_log_prob = list(token_data.values())[0].logprob + token_log_probs.append(token_log_prob) + + neg_log_likelihood = -torch.tensor( + token_log_probs, dtype=torch.float32, device="cpu").sum() + nll_sum += neg_log_likelihood + n_tokens += len(token_log_probs) + vllm_ppl = float(torch.exp(nll_sum / n_tokens)) + vllm_dtype = model_config.dtype + + # Accelerate ppl test by setting Transformers ppl score to a constant + if model_info.hf_ppl is None: + with hf_runner( + model_info.name, + dtype=model_info.hf_dtype, + ) as hf_model: + nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu") + n_tokens = 0 + for chunk in chunks: + inputs = hf_model.wrap_device( + {"input_ids": torch.tensor([chunk])}) + input_ids = inputs["input_ids"] + outputs = hf_model.model(input_ids, labels=input_ids) + neg_log_likelihood = outputs.loss + + neg_log_likelihood = 
neg_log_likelihood.to(torch.float32).cpu() + + num_loss_tokens = len(chunk) - 1 + nll_sum += neg_log_likelihood * num_loss_tokens + n_tokens += num_loss_tokens + + hf_ppl = float(torch.exp(nll_sum / n_tokens)) + hf_dtype = next(hf_model.model.parameters()).dtype + else: + hf_ppl = model_info.hf_ppl + hf_dtype = "Constant" + + differ = (vllm_ppl - hf_ppl) / hf_ppl + print("Model:", model_info.name) + print("VLLM:", vllm_dtype, vllm_ppl) + print("Transformers:", hf_dtype, hf_ppl) + print("Difference (%):", differ * 100) + + # The smaller the PPL, the better. + # We are not concerned that the vllm PPL is less than Transformers, + # so we only perform one-sided testing. + assert differ < atol diff --git a/tests/models/language/generation_ppl_test/test_gemma.py b/tests/models/language/generation_ppl_test/test_gemma.py new file mode 100644 index 0000000000000..5324de143d674 --- /dev/null +++ b/tests/models/language/generation_ppl_test/test_gemma.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from tests.models.utils import GenerateModelInfo + +from .ppl_utils import wikitext_ppl_test + +MODELS = [ + GenerateModelInfo("google/gemma-2b"), + GenerateModelInfo("google/gemma-2-2b"), + GenerateModelInfo("google/gemma-3-4b-it"), +] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo): + wikitext_ppl_test(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/generation_ppl_test/test_gpt.py b/tests/models/language/generation_ppl_test/test_gpt.py new file mode 100644 index 0000000000000..f3f9e55a24234 --- /dev/null +++ b/tests/models/language/generation_ppl_test/test_gpt.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from tests.models.utils import GenerateModelInfo + +from .ppl_utils import
wikitext_ppl_test + +MODELS = [GenerateModelInfo("openai-community/gpt2-large")] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo): + wikitext_ppl_test(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/generation_ppl_test/test_qwen.py b/tests/models/language/generation_ppl_test/test_qwen.py new file mode 100644 index 0000000000000..0d3127cbaac47 --- /dev/null +++ b/tests/models/language/generation_ppl_test/test_qwen.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from tests.models.utils import GenerateModelInfo + +from .ppl_utils import wikitext_ppl_test + +MODELS = [ + GenerateModelInfo("Qwen/Qwen3-0.6B"), + GenerateModelInfo("Qwen/Qwen3-0.6B-FP8"), + # transformers: + # Loading a GPTQ quantized model requires optimum, gptqmodel + # GenerateModelInfo("Qwen/Qwen3-0.6B-GPTQ-Int8"), +] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo): + wikitext_ppl_test(hf_runner, vllm_runner, model_info) diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index 8f8393c4e16fc..86751e0a4d5f4 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -59,7 +59,7 @@ def correctness_test_embed_models(hf_runner, with hf_runner( model_info.name, - dtype="float32", + dtype=model_info.hf_dtype, is_sentence_transformer=True, ) as hf_model: diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 7336c30bdda33..56a105e96e5ee 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -213,7 +213,7 @@ def mteb_test_embed_models(hf_runner, if model_info.mteb_score is None: with hf_runner(model_info.name, 
is_sentence_transformer=True, - dtype="float32") as hf_model: + dtype=model_info.hf_dtype) as hf_model: # e.g. setting default parameters for the encode method of hf_runner if hf_model_callback is not None: @@ -278,9 +278,12 @@ def run_mteb_rerank(cross_encoder, tasks, languages): return main_score -def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None): +def mteb_test_rerank_models_hf(hf_runner, + model_name, + hf_dtype="float32", + hf_model_callback=None): with hf_runner(model_name, is_cross_encoder=True, - dtype="float32") as hf_model: + dtype=hf_dtype) as hf_model: original_predict = hf_model.predict @@ -357,7 +360,7 @@ def mteb_test_rerank_models(hf_runner, # SentenceTransformers mteb score to a constant if model_info.mteb_score is None: st_main_score, st_dtype = mteb_test_rerank_models_hf( - hf_runner, model_info.name, hf_model_callback) + hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback) else: st_main_score = model_info.mteb_score st_dtype = "Constant" diff --git a/tests/models/utils.py b/tests/models/utils.py index ab0b27af4d697..44e9bf539bc17 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -347,14 +347,15 @@ class ModelInfo: name: str architecture: str = "" dtype: str = "auto" + hf_dtype: str = "float32" hf_overrides: Optional[dict[str, Any]] = None default_pooling_type: str = "" - mteb_score: Optional[float] = None enable_test: bool = True @dataclass class EmbedModelInfo(ModelInfo): + mteb_score: Optional[float] = None is_matryoshka: bool = False matryoshka_dimensions: Optional[list[int]] = None @@ -371,7 +372,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo): @dataclass class RerankModelInfo(ModelInfo): - pass + mteb_score: Optional[float] = None @dataclass @@ -384,6 +385,12 @@ class LASTPoolingRerankModelInfo(RerankModelInfo): default_pooling_type: str = "LAST" +@dataclass +class GenerateModelInfo(ModelInfo): + hf_dtype: str = "auto" + hf_ppl: Optional[float] = None + + def 
dummy_hf_overrides( hf_config: PretrainedConfig, *,