diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 13ed64ed00ff..d3c07cdda454 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -395,10 +395,8 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   - tests/quantization
-  - tests/models/quantization
   commands:
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
-  - pytest -v -s models/quantization
 
 - label: LM Eval Small Models # 53min
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@@ -509,6 +507,14 @@ steps:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 
+- label: Quantized Models Test
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+  - pytest -v -s models/quantization
+
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
   mirror_hardwares: [amd]
diff --git a/tests/models/language/generation/test_models.py b/tests/models/language/generation/test_common.py
similarity index 97%
rename from tests/models/language/generation/test_models.py
rename to tests/models/language/generation/test_common.py
index e55a682c04da..ab2898ffb2d0 100644
--- a/tests/models/language/generation/test_models.py
+++ b/tests/models/language/generation/test_common.py
@@ -1,9 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-Run `pytest tests/models/test_models.py`.
-"""
-
 import pytest
 import torch
 
diff --git a/tests/models/language/generation/test_granite.py b/tests/models/language/generation/test_granite.py
index 119b79d64c96..f381c34f44b8 100644
--- a/tests/models/language/generation/test_granite.py
+++ b/tests/models/language/generation/test_granite.py
@@ -1,8 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.
-
-Run `pytest tests/models/test_granite.py`.
-"""
 import pytest
 
 from ...utils import check_logprobs_close
diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py
index 79778072cc8b..c1b612ae213b 100644
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@@ -1,8 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
-
-Run `pytest tests/models/test_mistral.py`.
-"""
 import copy
 import json
 
diff --git a/tests/models/language/generation/test_phimoe.py b/tests/models/language/generation/test_phimoe.py
index f9757d6ac295..603ca1cb12a5 100644
--- a/tests/models/language/generation/test_phimoe.py
+++ b/tests/models/language/generation/test_phimoe.py
@@ -1,8 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for moe models using greedy sampling.
-
-Run `pytest tests/models/test_phimoe.py`.
-""" import pytest import torch diff --git a/tests/models/language/pooling/test_cls_models.py b/tests/models/language/pooling/test_classification.py similarity index 91% rename from tests/models/language/pooling/test_cls_models.py rename to tests/models/language/pooling/test_classification.py index 6a3cd8a5c594..44af3df08a86 100644 --- a/tests/models/language/pooling/test_cls_models.py +++ b/tests/models/language/pooling/test_classification.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the classification outputs of HF and vLLM models. - -Run `pytest tests/models/test_cls_models.py`. -""" import pytest import torch from transformers import AutoModelForSequenceClassification @@ -19,7 +15,7 @@ from vllm.platforms import current_platform ) @pytest.mark.parametrize("dtype", ["half"] if current_platform.is_rocm() else ["float"]) -def test_classification_models( +def test_models( hf_runner, vllm_runner, example_prompts, diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 2a90f47af545..9db385e77bdb 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the embedding outputs of HF and vLLM models. - -Run `pytest tests/models/embedding/language/test_embedding.py`. -""" import pytest from vllm.config import PoolerConfig diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 154aefe594a7..5287ca37c0fb 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -1,9 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -# ruff: noqa: E501 -"""Compare the scoring outputs of HF and vLLM models. - -Run `pytest tests/models/embedding/language/test_jina.py`. -""" import math import pytest @@ -22,9 +17,9 @@ TEXTS_2 = [ "Organic skincare for sensitive skin with aloe vera and chamomile.", "New makeup trends focus on bold colors and innovative techniques", "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille", - "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken", - "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla", - "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras", + "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken", # noqa: E501 + "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla", # noqa: E501 + "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras", # noqa: E501 "针对敏感肌专门设计的天然有机护肤产品", "新的化妆趋势注重鲜艳的颜色和创新的技巧", "敏感肌のために特別に設計された天然有機スキンケア製品", diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index d6408258ffce..e9527700c3ca 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -1,15 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the scoring outputs of HF and vLLM models. - -Run `pytest tests/models/embedding/language/test_scoring.py`. 
-""" import math import pytest import torch import torch.nn.functional as F -MODELS = [ +CROSS_ENCODER_MODELS = [ "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert "BAAI/bge-reranker-v2-m3", # Roberta ] @@ -28,21 +24,21 @@ TEXTS_2 = [ "The capital of Germany is Berlin.", ] +DTYPE = "half" -@pytest.fixture(scope="module", params=MODELS) + +@pytest.fixture(scope="module", params=CROSS_ENCODER_MODELS) def model_name(request): yield request.param -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): - +def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name): text_pair = [TEXTS_1[0], TEXTS_2[0]] - with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: + with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict([text_pair]).tolist() - with vllm_runner(model_name, task="score", dtype=dtype, + with vllm_runner(model_name, task="score", dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) @@ -52,18 +48,16 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): - +def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): text_pairs = [ [TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[0], TEXTS_2[1]], ] - with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: + with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, task="score", dtype=dtype, + with vllm_runner(model_name, task="score", dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) @@ -74,18 +68,16 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): - +def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): text_pairs = [ [TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[1], TEXTS_2[1]], ] - with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: + with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: hf_outputs = hf_model.predict(text_pairs).tolist() - with vllm_runner(model_name, task="score", dtype=dtype, + with vllm_runner(model_name, task="score", dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) @@ -101,13 +93,10 @@ def emb_model_name(request): yield request.param -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, - dtype: str): - +def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name): text_pair = [TEXTS_1[0], TEXTS_2[0]] - with hf_runner(emb_model_name, dtype=dtype, + with hf_runner(emb_model_name, dtype=DTYPE, is_sentence_transformer=True) as hf_model: hf_embeddings = hf_model.encode(text_pair) hf_outputs = [ @@ -116,7 +105,7 @@ def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, with vllm_runner(emb_model_name, task="embed", - dtype=dtype, + dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) @@ -126,16 +115,13 @@ def test_llm_1_to_1_embedding(vllm_runner, 
hf_runner, emb_model_name, assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, - dtype: str): - +def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): text_pairs = [ [TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[0], TEXTS_2[1]], ] - with hf_runner(emb_model_name, dtype=dtype, + with hf_runner(emb_model_name, dtype=DTYPE, is_sentence_transformer=True) as hf_model: hf_embeddings = [ hf_model.encode(text_pair) for text_pair in text_pairs @@ -147,7 +133,7 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, with vllm_runner(emb_model_name, task="embed", - dtype=dtype, + dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) @@ -158,16 +144,13 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) -@pytest.mark.parametrize("dtype", ["half"]) -def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, - dtype: str): - +def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): text_pairs = [ [TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[1], TEXTS_2[1]], ] - with hf_runner(emb_model_name, dtype=dtype, + with hf_runner(emb_model_name, dtype=DTYPE, is_sentence_transformer=True) as hf_model: hf_embeddings = [ hf_model.encode(text_pair) for text_pair in text_pairs @@ -179,7 +162,7 @@ def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, with vllm_runner(emb_model_name, task="embed", - dtype=dtype, + dtype=DTYPE, max_model_len=None) as vllm_model: vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index 81abc0e9e930..c050b35b76ba 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -1,8 +1,4 @@ # SPDX-License-Identifier: Apache-2.0 -"""Compare the embedding outputs of HF and vLLM models. - -Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`. -""" import pytest from ...utils import EmbedModelInfo, check_embeddings_close diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py index a215e1ec564d..1b8ac395ed17 100644 --- a/tests/models/language/pooling/test_truncation_control.py +++ b/tests/models/language/pooling/test_truncation_control.py @@ -5,18 +5,18 @@ MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" max_model_len = 128 input_str = """Immerse yourself in the enchanting chronicle of calculus, a - mathematical domain that has radically transformed our comprehension of - change and motion. Despite its roots in ancient civilizations, the - formal birth of calculus predominantly occurred in the 17th century, - primarily under the influential guidance of Sir Isaac Newton and Gottfried - Wilhelm Leibniz. The earliest traces of calculus concepts are found in - ancient Greek mathematics,most notably in the works of Eudoxus and - Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a - technique for computing areas and volumes through the use of finite sums. - This methodology laid crucial foundational work for integral calculus. 
-    In the 17th century, both Newton and Leibniz independently pioneered
-    calculus, each contributing unique perspectives that would shape this new
-    field."""
+mathematical domain that has radically transformed our comprehension of
+change and motion. Despite its roots in ancient civilizations, the
+formal birth of calculus predominantly occurred in the 17th century,
+primarily under the influential guidance of Sir Isaac Newton and Gottfried
+Wilhelm Leibniz. The earliest traces of calculus concepts are found in
+ancient Greek mathematics,most notably in the works of Eudoxus and
+Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
+technique for computing areas and volumes through the use of finite sums.
+This methodology laid crucial foundational work for integral calculus.
+In the 17th century, both Newton and Leibniz independently pioneered
+calculus, each contributing unique perspectives that would shape this new
+field."""
 
 
 def test_smaller_truncation_size(vllm_runner,
diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py
index 6ebe75f0e812..506b71472f4a 100644
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -1,8 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
-
-Run `pytest tests/models/test_mistral.py`.
-"""
 import json
 from dataclasses import asdict
 from typing import TYPE_CHECKING, Any, Optional
diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index 4603b4e8e834..4e48bdbd0428 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -119,10 +119,10 @@ def run_test(
         assert output.outputs[0].text == expected
 
 
-@create_new_process_for_each_test("spawn")
 @pytest.mark.core_model
 @pytest.mark.parametrize(
     "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
+@create_new_process_for_each_test()
 def test_models(vllm_runner, model) -> None:
     run_test(
         vllm_runner,
@@ -131,11 +131,11 @@ def test_models(vllm_runner, model) -> None:
     )
 
 
-@create_new_process_for_each_test("spawn")
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
+@create_new_process_for_each_test()
 def test_models_distributed(
     vllm_runner,
     model,
diff --git a/tests/models/quantization/test_aqlm.py b/tests/models/quantization/test_aqlm.py
index c4e142fcc9b5..548053b7ae43 100644
--- a/tests/models/quantization/test_aqlm.py
+++ b/tests/models/quantization/test_aqlm.py
@@ -1,9 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compare the outputs of a AQLM model between vLLM and HF Transformers
-
-Run `pytest tests/models/test_aqlm.py`.
-"""
-
 import pytest
 
 from tests.quantization.utils import is_quant_method_supported
diff --git a/tests/models/quantization/test_bitblas.py b/tests/models/quantization/test_bitblas.py
index 6d7c30126fc0..f0781394d81d 100644
--- a/tests/models/quantization/test_bitblas.py
+++ b/tests/models/quantization/test_bitblas.py
@@ -8,8 +8,6 @@ bitblas/GPTQ models are in the top 3 selections of each other.
 Note: bitblas internally uses locks to synchronize the threads. This can
 result in very slight nondeterminism for bitblas. As a result, we re-run the
 test up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_bitblas.py`.
 """
 
 from dataclasses import dataclass
diff --git a/tests/models/quantization/test_gptq_bitblas.py b/tests/models/quantization/test_gptq_bitblas.py
index 98cd03eb741b..c8e96455fd0c 100644
--- a/tests/models/quantization/test_gptq_bitblas.py
+++ b/tests/models/quantization/test_gptq_bitblas.py
@@ -8,8 +8,6 @@ bitblas/GPTQ models are in the top 3 selections of each other.
 Note: bitblas internally uses locks to synchronize the threads. This can
 result in very slight nondeterminism for bitblas. As a result, we re-run the
 test up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_bitblas.py`.
 """
 
 from dataclasses import dataclass
diff --git a/tests/models/quantization/test_gptq_marlin.py b/tests/models/quantization/test_gptq_marlin.py
index c6e7d234d1a9..680134c6eae8 100644
--- a/tests/models/quantization/test_gptq_marlin.py
+++ b/tests/models/quantization/test_gptq_marlin.py
@@ -1,13 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Compares the outputs of gptq vs gptq_marlin
+"""Compares the outputs of gptq vs gptq_marlin.
+
 Note: GPTQ and Marlin do not have bitwise correctness. As a result, in this
 test, we just confirm that the top selected tokens of the Marlin/GPTQ models
 are in the top 5 selections of each other.
 Note: Marlin internally uses locks to synchronize the threads. This can
 result in very slight nondeterminism for Marlin. As a result, we re-run the
 test up to 3 times to see if we pass.
-
-Run `pytest tests/models/test_gptq_marlin.py`.
 """
 
 import os
diff --git a/tests/models/quantization/test_gptq_marlin_24.py b/tests/models/quantization/test_gptq_marlin_24.py
index c1000b181b07..ce28f964d544 100644
--- a/tests/models/quantization/test_gptq_marlin_24.py
+++ b/tests/models/quantization/test_gptq_marlin_24.py
@@ -4,8 +4,6 @@
 Note: GPTQ and Marlin_24 do not have bitwise correctness. As a result, in this
 test, we just confirm that the top selected tokens of the Marlin/GPTQ models
 are in the top 3 selections of each other.
-
-Run `pytest tests/models/test_marlin_24.py`.
 """
 
 from dataclasses import dataclass
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 65bb11d6b5e4..6da488897be5 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -1,8 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Test the functionality of the Transformers backend.
-
-Run `pytest tests/models/test_transformers.py`.
-"""
+"""Test the functionality of the Transformers backend."""
 import pytest
 
 from ..conftest import HfRunner, VllmRunner
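
Reviewer note: each `test_scoring.py` hunk above makes the same two mechanical changes. The single-value `@pytest.mark.parametrize("dtype", ["half"])` decorator is replaced by the module-level `DTYPE` constant, and the `test_llm_*` functions are renamed to say what they exercise (`test_cross_encoder_*`, `test_embedding_*`). Below is a minimal sketch of that before/after shape; it runs without vLLM, and the `score()` helper is a hypothetical stand-in rather than the vLLM runner API.

```python
import math

import pytest

DTYPE = "half"  # one module-level constant instead of per-test parametrization


def score(text_1: str, text_2: str, dtype: str = DTYPE) -> float:
    """Hypothetical stand-in scorer so this sketch runs without vLLM."""
    return 1.0 if text_1 == text_2 else 0.0


# Before: a one-value parametrize added a redundant axis to every test ID.
@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_1(dtype: str):
    assert math.isclose(score("a", "a", dtype), 1.0, rel_tol=0.01)


# After: the constant is used directly and the name states what is tested.
def test_cross_encoder_1_to_1():
    assert math.isclose(score("a", "a"), 1.0, rel_tol=0.01)
```

Dropping the one-value parametrize changes no coverage; it only removes the `[half]` suffix from the generated pytest IDs.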