[CI/Build] Reorganize models tests (#17459)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-05-01 14:03:08 +08:00 committed by GitHub
parent aa4502e7f3
commit afb4429b4f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
65 changed files with 316 additions and 323 deletions

View File

@ -390,12 +390,15 @@ steps:
commands: commands:
- pytest -v -s benchmarks/ - pytest -v -s benchmarks/
- label: Quantization Test # 33min - label: Quantization Test
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
- tests/quantization - tests/quantization
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - tests/models/quantization
commands:
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- pytest -v -s models/quantization
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@ -441,82 +444,70 @@ steps:
commands: commands:
- pytest -v -s models/test_transformers.py - pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py - pytest -v -s models/test_registry.py
- pytest -v -s models/test_utils.py
- pytest -v -s models/test_vision.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531 # V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2' - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4' - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2' - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
- label: Language Models Test (Standard) # 32min - label: Language Models Test (Standard)
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/decoder_only/language - tests/models/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/language -m core_model
- pytest -v -s models/embedding/language -m core_model
- label: Language Models Test (Extended) # 1h10min - label: Language Models Test (Extended)
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/decoder_only/language - tests/models/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands: commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install causal-conv1d - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/language -m 'not core_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- label: Multi-Modal Models Test (Standard) # 40min - label: Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/decoder_only/audio_language - tests/models/multimodal
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- tests/models/encoder_decoder/vision_language
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal - pytest -v -s models/multimodal/processing
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
- pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model' - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py
- label: Multi-Modal Models Test (Extended) 1 # 48m - label: Multi-Modal Models Test (Extended) 1
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/decoder_only/audio_language - tests/models/multimodal
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 # 38m - label: Multi-Modal Models Test (Extended) 2
optional: true optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/decoder_only/vision_language - tests/models/multimodal
commands: commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
- label: Multi-Modal Models Test (Extended) 3
optional: true
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
# This test is used only in PR development phase to test individual models and should never run on main # This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test - label: Custom Models Test
@ -586,9 +577,8 @@ steps:
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error # Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - pytest models/language -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
# test sequence parallel # test sequence parallel
- pytest -v -s distributed/test_sequence_parallel.py - pytest -v -s distributed/test_sequence_parallel.py
# this test fails consistently. # this test fails consistently.

View File

@ -158,7 +158,6 @@ markers = [
"skip_global_cleanup", "skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly", "core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests", "cpu_model: enable this model test in CPU tests",
"quant_model: run this model test under Quantized category",
"split: run this test as part of a split", "split: run this test as part of a split",
"distributed: run this test only in distributed GPU tests", "distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1", "skip_v1: do not run this test with v1",

View File

@ -11,7 +11,7 @@ import requests
from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.entrypoints.openai.protocol import EmbeddingResponse
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ...models.embedding.utils import correctness_test from ...models.utils import run_embedding_correctness_test
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
MODEL_NAME = "intfloat/multilingual-e5-small" MODEL_NAME = "intfloat/multilingual-e5-small"
@ -76,7 +76,7 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
assert embeddings.usage.total_tokens == 11 assert embeddings.usage.total_tokens == 11
vllm_outputs = [d.embedding for d in embeddings.data] vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, input_texts, vllm_outputs) run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
# test using token IDs # test using token IDs
input_tokens = [1, 1, 1, 1, 1] input_tokens = [1, 1, 1, 1, 1]
@ -121,7 +121,7 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
assert embeddings.usage.total_tokens == 33 assert embeddings.usage.total_tokens == 33
vllm_outputs = [d.embedding for d in embeddings.data] vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, input_texts, vllm_outputs) run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
# test list[list[int]] # test list[list[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
@ -208,7 +208,7 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
model=model_name, model=model_name,
encoding_format="float") encoding_format="float")
float_data = [d.embedding for d in responses_float.data] float_data = [d.embedding for d in responses_float.data]
correctness_test(hf_model, input_texts, float_data) run_embedding_correctness_test(hf_model, input_texts, float_data)
responses_base64 = await client.embeddings.create(input=input_texts, responses_base64 = await client.embeddings.create(input=input_texts,
model=model_name, model=model_name,
@ -219,13 +219,13 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
np.frombuffer(base64.b64decode(data.embedding), np.frombuffer(base64.b64decode(data.embedding),
dtype="float32").tolist()) dtype="float32").tolist())
correctness_test(hf_model, input_texts, base64_data) run_embedding_correctness_test(hf_model, input_texts, base64_data)
# Default response is float32 decoded from base64 by OpenAI Client # Default response is float32 decoded from base64 by OpenAI Client
responses_default = await client.embeddings.create(input=input_texts, responses_default = await client.embeddings.create(input=input_texts,
model=model_name) model=model_name)
default_data = [d.embedding for d in responses_default.data] default_data = [d.embedding for d in responses_default.data]
correctness_test(hf_model, input_texts, default_data) run_embedding_correctness_test(hf_model, input_texts, default_data)
@pytest.mark.asyncio @pytest.mark.asyncio

View File

@ -11,7 +11,7 @@ import pytest
from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.entrypoints.openai.protocol import EmbeddingResponse
from ...conftest import HfRunner from ...conftest import HfRunner
from ...models.embedding.utils import EmbedModelInfo, correctness_test from ...models.utils import EmbedModelInfo, run_embedding_correctness_test
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
MODELS = [ MODELS = [
@ -95,7 +95,8 @@ async def test_matryoshka(model_info: EmbedModelInfo,
assert len(embeddings.data[0].embedding) == dimensions assert len(embeddings.data[0].embedding) == dimensions
vllm_outputs = [d.embedding for d in embeddings.data] vllm_outputs = [d.embedding for d in embeddings.data]
correctness_test(hf_model, prompts, vllm_outputs, dimensions) run_embedding_correctness_test(hf_model, prompts, vllm_outputs,
dimensions)
if model_info.is_matryoshka: if model_info.is_matryoshka:
valid_dimensions: list[Optional[int]] = [None] valid_dimensions: list[Optional[int]] = [None]

View File

@ -1,66 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from collections.abc import Sequence
from typing import NamedTuple, Optional
import torch
import torch.nn.functional as F
def check_embeddings_close(
*,
embeddings_0_lst: Sequence[list[float]],
embeddings_1_lst: Sequence[list[float]],
name_0: str,
name_1: str,
tol: float = 1e-3,
) -> None:
assert len(embeddings_0_lst) == len(embeddings_1_lst)
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
zip(embeddings_0_lst, embeddings_1_lst)):
assert len(embeddings_0) == len(embeddings_1), (
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
sim = F.cosine_similarity(torch.tensor(embeddings_0),
torch.tensor(embeddings_1),
dim=0)
fail_msg = (f"Test{prompt_idx}:"
f"\n{name_0}:\t{embeddings_0[:16]!r}"
f"\n{name_1}:\t{embeddings_1[:16]!r}")
assert sim >= 1 - tol, fail_msg
def matryoshka_fy(tensor, dimensions):
tensor = torch.tensor(tensor)
tensor = tensor[..., :dimensions]
tensor = F.normalize(tensor, p=2, dim=1)
return tensor
class EmbedModelInfo(NamedTuple):
name: str
is_matryoshka: bool
matryoshka_dimensions: Optional[list[int]] = None
architecture: str = ""
enable_test: bool = True
def correctness_test(hf_model,
inputs,
vllm_outputs: Sequence[list[float]],
dimensions: Optional[int] = None):
hf_outputs = hf_model.encode(inputs)
if dimensions:
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)

View File

@ -1,37 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
import pytest
from ....utils import multi_gpu_test
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", [
"meta-llama/Llama-3.2-11B-Vision-Instruct",
])
def test_models(hf_runner, vllm_runner, image_assets,
distributed_executor_backend, model) -> None:
dtype = "half"
max_tokens = 5
num_logprobs = 5
tensor_parallel_size = 2
if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
from .test_mllama import models, run_test
else:
raise NotImplementedError(f"Unsupported model: {model}")
run_test(
hf_runner,
vllm_runner,
image_assets,
model=models[0],
size_factors=[0.25, 0.5, 1.0],
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
)

View File

@ -1,8 +1,4 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
from typing import Optional from typing import Optional
import pytest import pytest

View File

@ -289,23 +289,25 @@ def test_multistep_correctness(
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]]) @pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
def test_hybrid_distributed_produces_identical_generation( @pytest.mark.parametrize("num_logprobs", [5])
def test_distributed_correctness(
vllm_runner, vllm_runner,
example_prompts, example_prompts,
model: str, model: str,
max_tokens: int, max_tokens: int,
num_logprobs: int,
) -> None: ) -> None:
with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts,
max_tokens)
with vllm_runner(model, tensor_parallel_size=1, with vllm_runner(model, tensor_parallel_size=1,
max_num_seqs=2) as vllm_model: max_num_seqs=2) as vllm_model:
vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts, vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
max_tokens) example_prompts, max_tokens, num_logprobs)
check_outputs_equal( with vllm_runner(model, tensor_parallel_size=2,
max_num_seqs=2) as vllm_model:
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=vllm_outputs_tp_1, outputs_0_lst=vllm_outputs_tp_1,
outputs_1_lst=vllm_outputs_tp_2, outputs_1_lst=vllm_outputs_tp_2,
name_0="vllm_tp_1", name_0="vllm_tp_1",

View File

@ -8,7 +8,7 @@ import pytest
from vllm.config import PoolerConfig from vllm.config import PoolerConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ..utils import check_embeddings_close from ...utils import check_embeddings_close
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -7,11 +7,10 @@ from array import array
import openai import openai
import pytest import pytest
import pytest_asyncio
from scipy.spatial.distance import cosine from scipy.spatial.distance import cosine
import vllm from vllm import LLM, SamplingParams
import vllm.config from vllm.config import ModelConfig
from vllm.utils import STR_BACKEND_ENV_VAR from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
@ -31,73 +30,45 @@ def _arr(arr):
return array("i", arr) return array("i", arr)
def test_find_array(monkeypatch: pytest.MonkeyPatch): def test_find_array():
# GritLM embedding implementation is only supported by XFormers backend. from vllm.model_executor.models.gritlm import GritLMPooler
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
from vllm.model_executor.models.gritlm import GritLMPooler model_config = ModelConfig(
MODEL_NAME,
task="embed",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
dtype="bfloat16",
seed=0,
)
pooler = GritLMPooler(model_config=model_config)
# Create an LLM object to get the model config. arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3 with pytest.raises(ValueError):
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3 pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
@pytest.fixture(scope="module")
def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend.
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with pytest.MonkeyPatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def server_generate():
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
with pytest.MonkeyPatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client_embedding(server_embedding: RemoteOpenAIServer):
async with server_embedding.get_async_client() as async_client:
yield async_client
@pytest_asyncio.fixture
async def client_generate(server_generate: RemoteOpenAIServer):
async with server_generate.get_async_client() as async_client:
yield async_client
def run_llm_encode( def run_llm_encode(
llm: vllm.LLM, llm: LLM,
queries: list[str], queries: list[str],
instruction: str, instruction: str,
) -> list[float]: ) -> list[list[float]]:
outputs = llm.encode([instruction + q for q in queries], ) outputs = llm.embed([instruction + q for q in queries])
return [output.outputs.embedding for output in outputs] return [output.outputs.embedding for output in outputs]
async def run_client_embeddings( async def run_client_embeddings(
client: vllm.LLM, client: openai.AsyncOpenAI,
queries: list[str], queries: list[str],
instruction: str, instruction: str,
) -> list[float]: ) -> list[list[float]]:
outputs = await client.embeddings.create( outputs = await client.embeddings.create(
model=MODEL_NAME, model=MODEL_NAME,
input=[instruction + q for q in queries], input=[instruction + q for q in queries],
@ -132,7 +103,7 @@ def get_test_data():
return queries, q_instruction, documents, d_instruction return queries, q_instruction, documents, d_instruction
def validate_embed_output(q_rep: list[float], d_rep: list[float]): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)
@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001)
cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001)
def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch,
vllm_runner):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) with vllm_runner(
MODEL_NAME,
task="embed",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
d_rep = run_llm_encode( d_rep = run_llm_encode(
llm, llm,
documents, documents,
d_instruction, d_instruction,
) )
q_rep = run_llm_encode( q_rep = run_llm_encode(
llm, llm,
queries, queries,
q_instruction, q_instruction,
) )
validate_embed_output(q_rep, d_rep) validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_gritlm_api_server_embedding( async def test_gritlm_api_server_embedding():
client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
d_rep = await run_client_embeddings( # GritLM embedding implementation is only supported by XFormers backend.
client_embedding, args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
documents, env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"}
d_instruction,
) with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
q_rep = await run_client_embeddings( client_embedding = server.get_async_client()
client_embedding,
queries, d_rep = await run_client_embeddings(
q_instruction, client_embedding,
) documents,
d_instruction,
)
q_rep = await run_client_embeddings(
client_embedding,
queries,
q_instruction,
)
validate_embed_output(q_rep, d_rep) validate_embed_output(q_rep, d_rep)
def test_gritlm_offline_gen(): def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" # GritLM embedding implementation is only supported by XFormers backend.
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN) input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params)
assert outputs[0].outputs[0].text == "The capital of France is Paris." with vllm_runner(
MODEL_NAME,
task="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params)
assert outputs[0].outputs[0].text == "The capital of France is Paris."
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI): async def test_gritlm_api_server_generate():
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
outputs = await client_generate.completions.create( # GritLM embedding implementation is only supported by XFormers backend.
model=MODEL_NAME, args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
prompt=input, env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"}
max_tokens=256,
temperature=0.0, with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
) client_generate = server.get_async_client()
outputs = await client_generate.completions.create(
model=MODEL_NAME,
prompt=input,
max_tokens=256,
temperature=0.0,
)
assert outputs.choices[0].text == "The capital of France is Paris." assert outputs.choices[0].text == "The capital of France is Paris."

View File

@ -8,9 +8,10 @@ import math
import pytest import pytest
from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy
from vllm import PoolingParams from vllm import PoolingParams
from ...utils import check_embeddings_close, matryoshka_fy
SCORING_MODELS = [ SCORING_MODELS = [
"jinaai/jina-reranker-v2-base-multilingual", # Roberta "jinaai/jina-reranker-v2-base-multilingual", # Roberta
] ]

View File

@ -5,9 +5,7 @@ Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`.
""" """
import pytest import pytest
from tests.models.embedding.utils import EmbedModelInfo from ...utils import EmbedModelInfo, check_embeddings_close
from ..utils import check_embeddings_close
EMBEDDING_PROMPTS = [ EMBEDDING_PROMPTS = [
'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!', 'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!',

View File

@ -267,6 +267,7 @@ VLM_TEST_SETTINGS = {
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner, patch_hf_runner=model_utils.gemma3_patch_hf_runner,
@ -423,6 +424,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
), ),
"minicpmo_26": VLMTestInfo( "minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"], models=["openbmb/MiniCPM-o-2_6"],
@ -434,6 +437,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
), ),
"minicpmv_26": VLMTestInfo( "minicpmv_26": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"], models=["openbmb/MiniCPM-V-2_6"],
@ -445,6 +450,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
), ),
"minimax_vl_01": VLMTestInfo( "minimax_vl_01": VLMTestInfo(
models=["MiniMaxAI/MiniMax-VL-01"], models=["MiniMaxAI/MiniMax-VL-01"],

View File

@ -16,6 +16,7 @@ INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n") NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
@pytest.mark.core_model
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"]) @pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])

View File

@ -17,7 +17,8 @@ from vllm.sequence import SampleLogprobs
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets) _ImageAssets)
from ....quantization.utils import is_quant_method_supported from ....quantization.utils import is_quant_method_supported
from ....utils import large_gpu_test from ....utils import (create_new_process_for_each_test, large_gpu_test,
multi_gpu_test)
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
_LIMIT_IMAGE_PER_PROMPT = 3 _LIMIT_IMAGE_PER_PROMPT = 3
@ -393,6 +394,37 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
) )
@create_new_process_for_each_test()
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_distributed(
hf_runner,
vllm_runner,
image_assets,
distributed_executor_backend,
model,
dtype,
max_tokens,
num_logprobs,
) -> None:
run_test(
hf_runner,
vllm_runner,
image_assets,
model=model,
size_factors=[0.25, 0.5, 1.0],
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
)
@large_gpu_test(min_gb=48) @large_gpu_test(min_gb=48)
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)

View File

@ -1,15 +1,12 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
"""
from typing import Optional from typing import Optional
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from ....conftest import VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test from ....utils import create_new_process_for_each_test, multi_gpu_test
PROMPTS = [ PROMPTS = [
@ -92,6 +89,7 @@ EXPECTED = {
def run_test( def run_test(
vllm_runner: type[VllmRunner],
model: str, model: str,
*, *,
tensor_parallel_size: int, tensor_parallel_size: int,
@ -100,38 +98,52 @@ def run_test(
prompt_list = PROMPTS * 10 prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10 expected_list = EXPECTED[model] * 10
llm = LLM( with vllm_runner(
model=model, model,
tensor_parallel_size=tensor_parallel_size, max_model_len=448,
distributed_executor_backend=distributed_executor_backend, tensor_parallel_size=tensor_parallel_size,
) distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.model
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0, temperature=0,
top_p=1.0, top_p=1.0,
max_tokens=200, max_tokens=200,
) )
outputs = llm.generate(prompt_list, sampling_params) outputs = llm.generate(prompt_list, sampling_params)
for output, expected in zip(outputs, expected_list): for output, expected in zip(outputs, expected_list):
print(output.outputs[0].text) print(output.outputs[0].text)
assert output.outputs[0].text == expected assert output.outputs[0].text == expected
@create_new_process_for_each_test() @create_new_process_for_each_test("spawn")
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"]) "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
def test_models(model) -> None: def test_models(vllm_runner, model) -> None:
run_test(model, tensor_parallel_size=1) run_test(
vllm_runner,
model,
tensor_parallel_size=1,
)
@create_new_process_for_each_test("spawn")
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_models_distributed(model, distributed_executor_backend) -> None: def test_models_distributed(
run_test(model, vllm_runner,
tensor_parallel_size=2, model,
distributed_executor_backend=distributed_executor_backend) distributed_executor_backend,
) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
)

View File

@ -10,7 +10,7 @@ from transformers import Qwen2VLForConditionalGeneration
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ..utils import check_embeddings_close from ...utils import check_embeddings_close
HF_TEXT_PROMPTS = [ HF_TEXT_PROMPTS = [
# T -> X # T -> X

View File

@ -8,7 +8,7 @@ from vllm.platforms import current_platform
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ..utils import check_embeddings_close from ...utils import check_embeddings_close
# Llava Next embedding implementation is only supported by CUDA. # Llava Next embedding implementation is only supported by CUDA.
# If run on ROCm, hf_model.model.resize_token_embeddings will # If run on ROCm, hf_model.model.resize_token_embeddings will

View File

@ -9,7 +9,7 @@ from vllm.assets.image import VLM_IMAGES_DIR
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ..utils import check_embeddings_close from ...utils import check_embeddings_close
HF_TEXT_PROMPTS = [ HF_TEXT_PROMPTS = [
# T -> X # T -> X

View File

@ -39,7 +39,6 @@ ground_truth_generations = [
] ]
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("aqlm"), @pytest.mark.skipif(not is_quant_method_supported("aqlm"),
reason="AQLM is not supported on this GPU type.") reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])

View File

@ -7,8 +7,8 @@ import torch
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets from ...conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close from ..utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
@ -85,7 +85,6 @@ def run_awq_test(
) )
@pytest.mark.quant_model
@pytest.mark.parametrize( @pytest.mark.parametrize(
("source_model", "quant_model"), ("source_model", "quant_model"),
[("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")], [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],

View File

@ -15,7 +15,7 @@ from dataclasses import dataclass
import pytest import pytest
from .utils import check_logprobs_close from ..utils import check_logprobs_close
@dataclass @dataclass

View File

@ -4,20 +4,15 @@
"""Tests fp8 models against ground truth generation """Tests fp8 models against ground truth generation
Note: these tests will only pass on L4 GPU. Note: these tests will only pass on L4 GPU.
""" """
import os
from typing import Optional
import pytest import pytest
from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import STR_BACKEND_ENV_VAR from vllm.utils import STR_BACKEND_ENV_VAR
from ...utils import check_logprobs_close from ..utils import check_logprobs_close
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize( @pytest.mark.parametrize(

View File

@ -14,9 +14,9 @@ from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ....conftest import VllmRunner from ...conftest import VllmRunner
from ....utils import multi_gpu_test from ...utils import multi_gpu_test
from ...utils import check_logprobs_close from ..utils import check_logprobs_close
os.environ["TOKENIZERS_PARALLELISM"] = "true" os.environ["TOKENIZERS_PARALLELISM"] = "true"
@ -38,7 +38,6 @@ LLAMA_CONFIG = GGUFTestConfig(
original_model="meta-llama/Llama-3.2-1B-Instruct", original_model="meta-llama/Llama-3.2-1B-Instruct",
gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF", gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf", gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
marks=[pytest.mark.quant_model],
) )
QWEN2_CONFIG = GGUFTestConfig( QWEN2_CONFIG = GGUFTestConfig(

View File

@ -15,7 +15,7 @@ from dataclasses import dataclass
import pytest import pytest
from .utils import check_logprobs_close from ..utils import check_logprobs_close
@dataclass @dataclass

View File

@ -16,7 +16,7 @@ import pytest
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
from ...utils import check_logprobs_close from ..utils import check_logprobs_close
os.environ["TOKENIZERS_PARALLELISM"] = "true" os.environ["TOKENIZERS_PARALLELISM"] = "true"
@ -34,7 +34,6 @@ MODELS = [
] ]
@pytest.mark.quant_model
@pytest.mark.flaky(reruns=3) @pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.") reason="gptq_marlin is not supported on this GPU type.")

View File

@ -13,7 +13,7 @@ import pytest
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ...utils import check_logprobs_close from ..utils import check_logprobs_close
@dataclass @dataclass
@ -39,7 +39,6 @@ model_pairs = [
] ]
@pytest.mark.quant_model
@pytest.mark.flaky(reruns=2) @pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"), @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
reason="Marlin24 is not supported on this GPU type.") reason="Marlin24 is not supported on this GPU type.")

View File

@ -40,7 +40,6 @@ EXPECTED_STRS_MAP = {
@pytest.mark.skip( @pytest.mark.skip(
reason= reason=
"Prevent unstable test based on golden strings from breaking the build.") "Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("model_name", MODELS)

View File

@ -41,7 +41,6 @@ EXPECTED_STRS_MAP = {
reason= reason=
"Prevent unstable test based on golden strings from breaking the build " "Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system.") " and test input model being too large and hanging the system.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("nvfp4"), @pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
reason="nvfp4 is not supported on this GPU type.") reason="nvfp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("model_name", MODELS)

View File

@ -2,9 +2,10 @@
import warnings import warnings
from collections.abc import Sequence from collections.abc import Sequence
from typing import Any, Optional, Union from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
import torch import torch
import torch.nn.functional as F
from vllm.config import ModelConfig, TaskOption from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext from vllm.inputs import InputContext
@ -12,6 +13,9 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from .registry import HF_EXAMPLE_MODELS from .registry import HF_EXAMPLE_MODELS
if TYPE_CHECKING:
from ..conftest import HfRunner
TokensText = tuple[list[int], str] TokensText = tuple[list[int], str]
@ -291,3 +295,63 @@ def build_model_context(
**model_config_kwargs, **model_config_kwargs,
) )
return InputContext(model_config) return InputContext(model_config)
def check_embeddings_close(
*,
embeddings_0_lst: Sequence[list[float]],
embeddings_1_lst: Sequence[list[float]],
name_0: str,
name_1: str,
tol: float = 1e-3,
) -> None:
assert len(embeddings_0_lst) == len(embeddings_1_lst)
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
zip(embeddings_0_lst, embeddings_1_lst)):
assert len(embeddings_0) == len(embeddings_1), (
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
sim = F.cosine_similarity(torch.tensor(embeddings_0),
torch.tensor(embeddings_1),
dim=0)
fail_msg = (f"Test{prompt_idx}:"
f"\n{name_0}:\t{embeddings_0[:16]!r}"
f"\n{name_1}:\t{embeddings_1[:16]!r}")
assert sim >= 1 - tol, fail_msg
def matryoshka_fy(tensor: torch.Tensor, dimensions: int):
tensor = torch.tensor(tensor)
tensor = tensor[..., :dimensions]
tensor = F.normalize(tensor, p=2, dim=1)
return tensor
class EmbedModelInfo(NamedTuple):
name: str
is_matryoshka: bool
matryoshka_dimensions: Optional[list[int]] = None
architecture: str = ""
enable_test: bool = True
def run_embedding_correctness_test(
hf_model: "HfRunner",
inputs: list[str],
vllm_outputs: Sequence[list[float]],
dimensions: Optional[int] = None,
):
hf_outputs = hf_model.encode(inputs)
if dimensions:
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)

View File

@ -1043,8 +1043,10 @@ class ModelConfig:
if self.is_attention_free: if self.is_attention_free:
return 0 return 0
if hasattr(self.hf_text_config, "head_dim"): # NOTE: Some configs may set head_dim=None in the config
if getattr(self.hf_text_config, "head_dim", None) is not None:
return self.hf_text_config.head_dim return self.hf_text_config.head_dim
# FIXME(woosuk): This may not be true for all models. # FIXME(woosuk): This may not be true for all models.
return (self.hf_text_config.hidden_size // return (self.hf_text_config.hidden_size //
self.hf_text_config.num_attention_heads) self.hf_text_config.num_attention_heads)

View File

@ -127,8 +127,10 @@ class LlamaAttention(nn.Module):
assert tp_size % self.total_num_kv_heads == 0 assert tp_size % self.total_num_kv_heads == 0
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
# MistralConfig has an optional head_dim introduced by Mistral-Nemo # MistralConfig has an optional head_dim introduced by Mistral-Nemo
self.head_dim = getattr(config, "head_dim", head_dim = getattr(config, "head_dim", None)
self.hidden_size // self.total_num_heads) if head_dim is None:
head_dim = self.hidden_size // self.total_num_heads
self.head_dim = head_dim
# Phi models introduced a partial_rotary_factor parameter in the config # Phi models introduced a partial_rotary_factor parameter in the config
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", self.partial_rotary_factor = getattr(config, "partial_rotary_factor",
1) 1)