mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-24 13:55:50 +08:00
[CI/Build] Reorganize models tests (#17459)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
aa4502e7f3
commit
afb4429b4f
@ -390,12 +390,15 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
|
||||
- label: Quantization Test # 33min
|
||||
- label: Quantization Test
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
- tests/models/quantization
|
||||
commands:
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
- pytest -v -s models/quantization
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||
@ -441,82 +444,70 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s models/test_transformers.py
|
||||
- pytest -v -s models/test_registry.py
|
||||
- pytest -v -s models/test_utils.py
|
||||
- pytest -v -s models/test_vision.py
|
||||
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
|
||||
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
|
||||
|
||||
- label: Language Models Test (Standard) # 32min
|
||||
- label: Language Models Test (Standard)
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
- tests/models/language
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/language -m core_model
|
||||
- pytest -v -s models/language -m core_model
|
||||
|
||||
- label: Language Models Test (Extended) # 1h10min
|
||||
- label: Language Models Test (Extended)
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/language
|
||||
- tests/models/embedding/language
|
||||
- tests/models/encoder_decoder/language
|
||||
- tests/models/language
|
||||
commands:
|
||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
||||
- pip install causal-conv1d
|
||||
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/language -m 'not core_model'
|
||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
||||
- pytest -v -s models/language -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Standard) # 40min
|
||||
- label: Multi-Modal Models Test (Standard)
|
||||
#mirror_hardwares: [amd]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/audio_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/audio_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m core_model
|
||||
- pytest -v -s models/decoder_only/vision_language/test_interleaved.py
|
||||
- pytest -v -s models/multimodal/processing
|
||||
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
|
||||
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 1 # 48m
|
||||
- label: Multi-Modal Models Test (Extended) 1
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/audio_language
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/embedding/vision_language
|
||||
- tests/models/encoder_decoder/vision_language
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
|
||||
- pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 2 # 38m
|
||||
- label: Multi-Modal Models Test (Extended) 2
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/decoder_only/vision_language
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
||||
|
||||
- label: Multi-Modal Models Test (Extended) 3
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
commands:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
# This test is used only in PR development phase to test individual models and should never run on main
|
||||
- label: Custom Models Test
|
||||
@ -586,9 +577,8 @@ steps:
|
||||
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
|
||||
# Avoid importing model tests that cause CUDA reinitialization error
|
||||
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/language -v -s -m 'distributed(num_gpus=2)'
|
||||
- pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
|
||||
# test sequence parallel
|
||||
- pytest -v -s distributed/test_sequence_parallel.py
|
||||
# this test fails consistently.
|
||||
|
||||
@ -158,7 +158,6 @@ markers = [
|
||||
"skip_global_cleanup",
|
||||
"core_model: enable this model test in each PR instead of only nightly",
|
||||
"cpu_model: enable this model test in CPU tests",
|
||||
"quant_model: run this model test under Quantized category",
|
||||
"split: run this test as part of a split",
|
||||
"distributed: run this test only in distributed GPU tests",
|
||||
"skip_v1: do not run this test with v1",
|
||||
|
||||
@ -11,7 +11,7 @@ import requests
|
||||
from vllm.entrypoints.openai.protocol import EmbeddingResponse
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...models.embedding.utils import correctness_test
|
||||
from ...models.utils import run_embedding_correctness_test
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
@ -76,7 +76,7 @@ async def test_single_embedding(hf_model, client: openai.AsyncOpenAI,
|
||||
assert embeddings.usage.total_tokens == 11
|
||||
|
||||
vllm_outputs = [d.embedding for d in embeddings.data]
|
||||
correctness_test(hf_model, input_texts, vllm_outputs)
|
||||
run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
|
||||
|
||||
# test using token IDs
|
||||
input_tokens = [1, 1, 1, 1, 1]
|
||||
@ -121,7 +121,7 @@ async def test_batch_embedding(hf_model, client: openai.AsyncOpenAI,
|
||||
assert embeddings.usage.total_tokens == 33
|
||||
|
||||
vllm_outputs = [d.embedding for d in embeddings.data]
|
||||
correctness_test(hf_model, input_texts, vllm_outputs)
|
||||
run_embedding_correctness_test(hf_model, input_texts, vllm_outputs)
|
||||
|
||||
# test list[list[int]]
|
||||
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
|
||||
@ -208,7 +208,7 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
|
||||
model=model_name,
|
||||
encoding_format="float")
|
||||
float_data = [d.embedding for d in responses_float.data]
|
||||
correctness_test(hf_model, input_texts, float_data)
|
||||
run_embedding_correctness_test(hf_model, input_texts, float_data)
|
||||
|
||||
responses_base64 = await client.embeddings.create(input=input_texts,
|
||||
model=model_name,
|
||||
@ -219,13 +219,13 @@ async def test_batch_base64_embedding(hf_model, client: openai.AsyncOpenAI,
|
||||
np.frombuffer(base64.b64decode(data.embedding),
|
||||
dtype="float32").tolist())
|
||||
|
||||
correctness_test(hf_model, input_texts, base64_data)
|
||||
run_embedding_correctness_test(hf_model, input_texts, base64_data)
|
||||
|
||||
# Default response is float32 decoded from base64 by OpenAI Client
|
||||
responses_default = await client.embeddings.create(input=input_texts,
|
||||
model=model_name)
|
||||
default_data = [d.embedding for d in responses_default.data]
|
||||
correctness_test(hf_model, input_texts, default_data)
|
||||
run_embedding_correctness_test(hf_model, input_texts, default_data)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -11,7 +11,7 @@ import pytest
|
||||
from vllm.entrypoints.openai.protocol import EmbeddingResponse
|
||||
|
||||
from ...conftest import HfRunner
|
||||
from ...models.embedding.utils import EmbedModelInfo, correctness_test
|
||||
from ...models.utils import EmbedModelInfo, run_embedding_correctness_test
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODELS = [
|
||||
@ -95,7 +95,8 @@ async def test_matryoshka(model_info: EmbedModelInfo,
|
||||
assert len(embeddings.data[0].embedding) == dimensions
|
||||
|
||||
vllm_outputs = [d.embedding for d in embeddings.data]
|
||||
correctness_test(hf_model, prompts, vllm_outputs, dimensions)
|
||||
run_embedding_correctness_test(hf_model, prompts, vllm_outputs,
|
||||
dimensions)
|
||||
|
||||
if model_info.is_matryoshka:
|
||||
valid_dimensions: list[Optional[int]] = [None]
|
||||
|
||||
@ -1,66 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from collections.abc import Sequence
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
def check_embeddings_close(
|
||||
*,
|
||||
embeddings_0_lst: Sequence[list[float]],
|
||||
embeddings_1_lst: Sequence[list[float]],
|
||||
name_0: str,
|
||||
name_1: str,
|
||||
tol: float = 1e-3,
|
||||
) -> None:
|
||||
assert len(embeddings_0_lst) == len(embeddings_1_lst)
|
||||
|
||||
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
|
||||
zip(embeddings_0_lst, embeddings_1_lst)):
|
||||
assert len(embeddings_0) == len(embeddings_1), (
|
||||
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
|
||||
|
||||
sim = F.cosine_similarity(torch.tensor(embeddings_0),
|
||||
torch.tensor(embeddings_1),
|
||||
dim=0)
|
||||
|
||||
fail_msg = (f"Test{prompt_idx}:"
|
||||
f"\n{name_0}:\t{embeddings_0[:16]!r}"
|
||||
f"\n{name_1}:\t{embeddings_1[:16]!r}")
|
||||
|
||||
assert sim >= 1 - tol, fail_msg
|
||||
|
||||
|
||||
def matryoshka_fy(tensor, dimensions):
|
||||
tensor = torch.tensor(tensor)
|
||||
tensor = tensor[..., :dimensions]
|
||||
tensor = F.normalize(tensor, p=2, dim=1)
|
||||
return tensor
|
||||
|
||||
|
||||
class EmbedModelInfo(NamedTuple):
|
||||
name: str
|
||||
is_matryoshka: bool
|
||||
matryoshka_dimensions: Optional[list[int]] = None
|
||||
architecture: str = ""
|
||||
enable_test: bool = True
|
||||
|
||||
|
||||
def correctness_test(hf_model,
|
||||
inputs,
|
||||
vllm_outputs: Sequence[list[float]],
|
||||
dimensions: Optional[int] = None):
|
||||
|
||||
hf_outputs = hf_model.encode(inputs)
|
||||
if dimensions:
|
||||
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
@ -1,37 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", [
|
||||
"meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
])
|
||||
def test_models(hf_runner, vllm_runner, image_assets,
|
||||
distributed_executor_backend, model) -> None:
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
num_logprobs = 5
|
||||
tensor_parallel_size = 2
|
||||
|
||||
if model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
|
||||
from .test_mllama import models, run_test
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@ -1,8 +1,4 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
@ -289,23 +289,25 @@ def test_multistep_correctness(
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("model", [SSM_MODELS[0], HYBRID_MODELS[0]])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
def test_hybrid_distributed_produces_identical_generation(
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_distributed_correctness(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(model, tensor_parallel_size=2,
|
||||
max_num_seqs=2) as vllm_model:
|
||||
vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts,
|
||||
max_tokens)
|
||||
|
||||
with vllm_runner(model, tensor_parallel_size=1,
|
||||
max_num_seqs=2) as vllm_model:
|
||||
vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts,
|
||||
max_tokens)
|
||||
vllm_outputs_tp_1 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_outputs_equal(
|
||||
with vllm_runner(model, tensor_parallel_size=2,
|
||||
max_num_seqs=2) as vllm_model:
|
||||
vllm_outputs_tp_2 = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=vllm_outputs_tp_1,
|
||||
outputs_1_lst=vllm_outputs_tp_2,
|
||||
name_0="vllm_tp_1",
|
||||
@ -8,7 +8,7 @@ import pytest
|
||||
from vllm.config import PoolerConfig
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import check_embeddings_close
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -7,11 +7,10 @@ from array import array
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from scipy.spatial.distance import cosine
|
||||
|
||||
import vllm
|
||||
import vllm.config
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.utils import STR_BACKEND_ENV_VAR
|
||||
|
||||
from ....utils import RemoteOpenAIServer
|
||||
@ -31,73 +30,45 @@ def _arr(arr):
|
||||
return array("i", arr)
|
||||
|
||||
|
||||
def test_find_array(monkeypatch: pytest.MonkeyPatch):
|
||||
# GritLM embedding implementation is only supported by XFormers backend.
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
|
||||
def test_find_array():
|
||||
from vllm.model_executor.models.gritlm import GritLMPooler
|
||||
|
||||
from vllm.model_executor.models.gritlm import GritLMPooler
|
||||
model_config = ModelConfig(
|
||||
MODEL_NAME,
|
||||
task="embed",
|
||||
tokenizer=MODEL_NAME,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
dtype="bfloat16",
|
||||
seed=0,
|
||||
)
|
||||
pooler = GritLMPooler(model_config=model_config)
|
||||
|
||||
# Create an LLM object to get the model config.
|
||||
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
|
||||
pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
|
||||
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||
|
||||
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
|
||||
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
|
||||
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
|
||||
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
|
||||
|
||||
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
|
||||
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
|
||||
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
|
||||
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server_embedding():
|
||||
# GritLM embedding implementation is only supported by XFormers backend.
|
||||
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
with pytest.MonkeyPatch.context() as m:
|
||||
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server_generate():
|
||||
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
with pytest.MonkeyPatch.context() as m:
|
||||
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
|
||||
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client_embedding(server_embedding: RemoteOpenAIServer):
|
||||
async with server_embedding.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client_generate(server_generate: RemoteOpenAIServer):
|
||||
async with server_generate.get_async_client() as async_client:
|
||||
yield async_client
|
||||
with pytest.raises(ValueError):
|
||||
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
|
||||
|
||||
|
||||
def run_llm_encode(
|
||||
llm: vllm.LLM,
|
||||
llm: LLM,
|
||||
queries: list[str],
|
||||
instruction: str,
|
||||
) -> list[float]:
|
||||
outputs = llm.encode([instruction + q for q in queries], )
|
||||
) -> list[list[float]]:
|
||||
outputs = llm.embed([instruction + q for q in queries])
|
||||
return [output.outputs.embedding for output in outputs]
|
||||
|
||||
|
||||
async def run_client_embeddings(
|
||||
client: vllm.LLM,
|
||||
client: openai.AsyncOpenAI,
|
||||
queries: list[str],
|
||||
instruction: str,
|
||||
) -> list[float]:
|
||||
) -> list[list[float]]:
|
||||
outputs = await client.embeddings.create(
|
||||
model=MODEL_NAME,
|
||||
input=[instruction + q for q in queries],
|
||||
@ -132,7 +103,7 @@ def get_test_data():
|
||||
return queries, q_instruction, documents, d_instruction
|
||||
|
||||
|
||||
def validate_embed_output(q_rep: list[float], d_rep: list[float]):
|
||||
def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
|
||||
cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
|
||||
assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)
|
||||
|
||||
@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
|
||||
assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001)
|
||||
|
||||
cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
|
||||
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
|
||||
assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001)
|
||||
|
||||
|
||||
def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
|
||||
def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch,
|
||||
vllm_runner):
|
||||
# GritLM embedding implementation is only supported by XFormers backend.
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
|
||||
|
||||
queries, q_instruction, documents, d_instruction = get_test_data()
|
||||
|
||||
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
task="embed",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.model
|
||||
|
||||
d_rep = run_llm_encode(
|
||||
llm,
|
||||
documents,
|
||||
d_instruction,
|
||||
)
|
||||
q_rep = run_llm_encode(
|
||||
llm,
|
||||
queries,
|
||||
q_instruction,
|
||||
)
|
||||
d_rep = run_llm_encode(
|
||||
llm,
|
||||
documents,
|
||||
d_instruction,
|
||||
)
|
||||
q_rep = run_llm_encode(
|
||||
llm,
|
||||
queries,
|
||||
q_instruction,
|
||||
)
|
||||
|
||||
validate_embed_output(q_rep, d_rep)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gritlm_api_server_embedding(
|
||||
client_embedding: openai.AsyncOpenAI, ):
|
||||
async def test_gritlm_api_server_embedding():
|
||||
queries, q_instruction, documents, d_instruction = get_test_data()
|
||||
|
||||
d_rep = await run_client_embeddings(
|
||||
client_embedding,
|
||||
documents,
|
||||
d_instruction,
|
||||
)
|
||||
q_rep = await run_client_embeddings(
|
||||
client_embedding,
|
||||
queries,
|
||||
q_instruction,
|
||||
)
|
||||
# GritLM embedding implementation is only supported by XFormers backend.
|
||||
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"}
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
|
||||
client_embedding = server.get_async_client()
|
||||
|
||||
d_rep = await run_client_embeddings(
|
||||
client_embedding,
|
||||
documents,
|
||||
d_instruction,
|
||||
)
|
||||
q_rep = await run_client_embeddings(
|
||||
client_embedding,
|
||||
queries,
|
||||
q_instruction,
|
||||
)
|
||||
|
||||
validate_embed_output(q_rep, d_rep)
|
||||
|
||||
|
||||
def test_gritlm_offline_gen():
|
||||
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
|
||||
def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
|
||||
# GritLM embedding implementation is only supported by XFormers backend.
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "0")
|
||||
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
|
||||
|
||||
llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN)
|
||||
sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256)
|
||||
outputs = llm.generate(input, sampling_params=sampling_params)
|
||||
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
|
||||
|
||||
assert outputs[0].outputs[0].text == "The capital of France is Paris."
|
||||
with vllm_runner(
|
||||
MODEL_NAME,
|
||||
task="generate",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.model
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
|
||||
outputs = llm.generate(input, sampling_params=sampling_params)
|
||||
|
||||
assert outputs[0].outputs[0].text == "The capital of France is Paris."
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI):
|
||||
async def test_gritlm_api_server_generate():
|
||||
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
|
||||
|
||||
outputs = await client_generate.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=input,
|
||||
max_tokens=256,
|
||||
temperature=0.0,
|
||||
)
|
||||
# GritLM embedding implementation is only supported by XFormers backend.
|
||||
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
|
||||
env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"}
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
|
||||
client_generate = server.get_async_client()
|
||||
|
||||
outputs = await client_generate.completions.create(
|
||||
model=MODEL_NAME,
|
||||
prompt=input,
|
||||
max_tokens=256,
|
||||
temperature=0.0,
|
||||
)
|
||||
|
||||
assert outputs.choices[0].text == "The capital of France is Paris."
|
||||
@ -8,9 +8,10 @@ import math
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy
|
||||
from vllm import PoolingParams
|
||||
|
||||
from ...utils import check_embeddings_close, matryoshka_fy
|
||||
|
||||
SCORING_MODELS = [
|
||||
"jinaai/jina-reranker-v2-base-multilingual", # Roberta
|
||||
]
|
||||
@ -5,9 +5,7 @@ Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from tests.models.embedding.utils import EmbedModelInfo
|
||||
|
||||
from ..utils import check_embeddings_close
|
||||
from ...utils import EmbedModelInfo, check_embeddings_close
|
||||
|
||||
EMBEDDING_PROMPTS = [
|
||||
'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!',
|
||||
@ -267,6 +267,7 @@ VLM_TEST_SETTINGS = {
|
||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype="bfloat16",
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
|
||||
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
|
||||
@ -423,6 +424,8 @@ VLM_TEST_SETTINGS = {
|
||||
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
|
||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
|
||||
marks=[pytest.mark.skip("HF import fails")],
|
||||
),
|
||||
"minicpmo_26": VLMTestInfo(
|
||||
models=["openbmb/MiniCPM-o-2_6"],
|
||||
@ -434,6 +437,8 @@ VLM_TEST_SETTINGS = {
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
|
||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
|
||||
marks=[pytest.mark.skip("HF import fails")],
|
||||
),
|
||||
"minicpmv_26": VLMTestInfo(
|
||||
models=["openbmb/MiniCPM-V-2_6"],
|
||||
@ -445,6 +450,8 @@ VLM_TEST_SETTINGS = {
|
||||
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
|
||||
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
|
||||
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
|
||||
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
|
||||
marks=[pytest.mark.skip("HF import fails")],
|
||||
),
|
||||
"minimax_vl_01": VLMTestInfo(
|
||||
models=["MiniMaxAI/MiniMax-VL-01"],
|
||||
@ -16,6 +16,7 @@ INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
|
||||
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["float16"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@ -17,7 +17,8 @@ from vllm.sequence import SampleLogprobs
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ....quantization.utils import is_quant_method_supported
|
||||
from ....utils import large_gpu_test
|
||||
from ....utils import (create_new_process_for_each_test, large_gpu_test,
|
||||
multi_gpu_test)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_LIMIT_IMAGE_PER_PROMPT = 3
|
||||
@ -393,6 +394,37 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
|
||||
)
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
distributed_executor_backend,
|
||||
model,
|
||||
dtype,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=model,
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@ -1,15 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm import SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
from ....utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
PROMPTS = [
|
||||
@ -92,6 +89,7 @@ EXPECTED = {
|
||||
|
||||
|
||||
def run_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
model: str,
|
||||
*,
|
||||
tensor_parallel_size: int,
|
||||
@ -100,38 +98,52 @@ def run_test(
|
||||
prompt_list = PROMPTS * 10
|
||||
expected_list = EXPECTED[model] * 10
|
||||
|
||||
llm = LLM(
|
||||
model=model,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=448,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.model
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
top_p=1.0,
|
||||
max_tokens=200,
|
||||
)
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0,
|
||||
top_p=1.0,
|
||||
max_tokens=200,
|
||||
)
|
||||
|
||||
outputs = llm.generate(prompt_list, sampling_params)
|
||||
outputs = llm.generate(prompt_list, sampling_params)
|
||||
|
||||
for output, expected in zip(outputs, expected_list):
|
||||
print(output.outputs[0].text)
|
||||
assert output.outputs[0].text == expected
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
@create_new_process_for_each_test("spawn")
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize(
|
||||
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
|
||||
def test_models(model) -> None:
|
||||
run_test(model, tensor_parallel_size=1)
|
||||
def test_models(vllm_runner, model) -> None:
|
||||
run_test(
|
||||
vllm_runner,
|
||||
model,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@create_new_process_for_each_test("spawn")
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
def test_models_distributed(model, distributed_executor_backend) -> None:
|
||||
run_test(model,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend)
|
||||
def test_models_distributed(
|
||||
vllm_runner,
|
||||
model,
|
||||
distributed_executor_backend,
|
||||
) -> None:
|
||||
run_test(
|
||||
vllm_runner,
|
||||
model,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@ -10,7 +10,7 @@ from transformers import Qwen2VLForConditionalGeneration
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
from ..utils import check_embeddings_close
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
HF_TEXT_PROMPTS = [
|
||||
# T -> X
|
||||
@ -8,7 +8,7 @@ from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
from ..utils import check_embeddings_close
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
# Llava Next embedding implementation is only supported by CUDA.
|
||||
# If run on ROCm, hf_model.model.resize_token_embeddings will
|
||||
@ -9,7 +9,7 @@ from vllm.assets.image import VLM_IMAGES_DIR
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....utils import large_gpu_test
|
||||
from ..utils import check_embeddings_close
|
||||
from ...utils import check_embeddings_close
|
||||
|
||||
HF_TEXT_PROMPTS = [
|
||||
# T -> X
|
||||
@ -39,7 +39,6 @@ ground_truth_generations = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.quant_model
|
||||
@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
|
||||
reason="AQLM is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
|
||||
@ -7,8 +7,8 @@ import torch
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
from ...conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@ -85,7 +85,6 @@ def run_awq_test(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.quant_model
|
||||
@pytest.mark.parametrize(
|
||||
("source_model", "quant_model"),
|
||||
[("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
|
||||
@ -15,7 +15,7 @@ from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -4,20 +4,15 @@
|
||||
"""Tests fp8 models against ground truth generation
|
||||
Note: these tests will only pass on L4 GPU.
|
||||
"""
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import STR_BACKEND_ENV_VAR
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
|
||||
@pytest.mark.quant_model
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize(
|
||||
@ -14,9 +14,9 @@ from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
from ....utils import multi_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
from ...conftest import VllmRunner
|
||||
from ...utils import multi_gpu_test
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@ -38,7 +38,6 @@ LLAMA_CONFIG = GGUFTestConfig(
|
||||
original_model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
|
||||
gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
|
||||
marks=[pytest.mark.quant_model],
|
||||
)
|
||||
|
||||
QWEN2_CONFIG = GGUFTestConfig(
|
||||
@ -15,7 +15,7 @@ from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -16,7 +16,7 @@ import pytest
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@ -34,7 +34,6 @@ MODELS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.quant_model
|
||||
@pytest.mark.flaky(reruns=3)
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
|
||||
reason="gptq_marlin is not supported on this GPU type.")
|
||||
@ -13,7 +13,7 @@ import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -39,7 +39,6 @@ model_pairs = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.quant_model
|
||||
@pytest.mark.flaky(reruns=2)
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
|
||||
reason="Marlin24 is not supported on this GPU type.")
|
||||
@ -40,7 +40,6 @@ EXPECTED_STRS_MAP = {
|
||||
@pytest.mark.skip(
|
||||
reason=
|
||||
"Prevent unstable test based on golden strings from breaking the build.")
|
||||
@pytest.mark.quant_model
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model_name", MODELS)
|
||||
@ -41,7 +41,6 @@ EXPECTED_STRS_MAP = {
|
||||
reason=
|
||||
"Prevent unstable test based on golden strings from breaking the build "
|
||||
" and test input model being too large and hanging the system.")
|
||||
@pytest.mark.quant_model
|
||||
@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
|
||||
reason="nvfp4 is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model_name", MODELS)
|
||||
@ -2,9 +2,10 @@
|
||||
|
||||
import warnings
|
||||
from collections.abc import Sequence
|
||||
from typing import Any, Optional, Union
|
||||
from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.config import ModelConfig, TaskOption
|
||||
from vllm.inputs import InputContext
|
||||
@ -12,6 +13,9 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
|
||||
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..conftest import HfRunner
|
||||
|
||||
TokensText = tuple[list[int], str]
|
||||
|
||||
|
||||
@ -291,3 +295,63 @@ def build_model_context(
|
||||
**model_config_kwargs,
|
||||
)
|
||||
return InputContext(model_config)
|
||||
|
||||
|
||||
def check_embeddings_close(
|
||||
*,
|
||||
embeddings_0_lst: Sequence[list[float]],
|
||||
embeddings_1_lst: Sequence[list[float]],
|
||||
name_0: str,
|
||||
name_1: str,
|
||||
tol: float = 1e-3,
|
||||
) -> None:
|
||||
assert len(embeddings_0_lst) == len(embeddings_1_lst)
|
||||
|
||||
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
|
||||
zip(embeddings_0_lst, embeddings_1_lst)):
|
||||
assert len(embeddings_0) == len(embeddings_1), (
|
||||
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
|
||||
|
||||
sim = F.cosine_similarity(torch.tensor(embeddings_0),
|
||||
torch.tensor(embeddings_1),
|
||||
dim=0)
|
||||
|
||||
fail_msg = (f"Test{prompt_idx}:"
|
||||
f"\n{name_0}:\t{embeddings_0[:16]!r}"
|
||||
f"\n{name_1}:\t{embeddings_1[:16]!r}")
|
||||
|
||||
assert sim >= 1 - tol, fail_msg
|
||||
|
||||
|
||||
def matryoshka_fy(tensor: torch.Tensor, dimensions: int):
|
||||
tensor = torch.tensor(tensor)
|
||||
tensor = tensor[..., :dimensions]
|
||||
tensor = F.normalize(tensor, p=2, dim=1)
|
||||
return tensor
|
||||
|
||||
|
||||
class EmbedModelInfo(NamedTuple):
|
||||
name: str
|
||||
is_matryoshka: bool
|
||||
matryoshka_dimensions: Optional[list[int]] = None
|
||||
architecture: str = ""
|
||||
enable_test: bool = True
|
||||
|
||||
|
||||
def run_embedding_correctness_test(
|
||||
hf_model: "HfRunner",
|
||||
inputs: list[str],
|
||||
vllm_outputs: Sequence[list[float]],
|
||||
dimensions: Optional[int] = None,
|
||||
):
|
||||
hf_outputs = hf_model.encode(inputs)
|
||||
if dimensions:
|
||||
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
|
||||
@ -1043,8 +1043,10 @@ class ModelConfig:
|
||||
if self.is_attention_free:
|
||||
return 0
|
||||
|
||||
if hasattr(self.hf_text_config, "head_dim"):
|
||||
# NOTE: Some configs may set head_dim=None in the config
|
||||
if getattr(self.hf_text_config, "head_dim", None) is not None:
|
||||
return self.hf_text_config.head_dim
|
||||
|
||||
# FIXME(woosuk): This may not be true for all models.
|
||||
return (self.hf_text_config.hidden_size //
|
||||
self.hf_text_config.num_attention_heads)
|
||||
|
||||
@ -127,8 +127,10 @@ class LlamaAttention(nn.Module):
|
||||
assert tp_size % self.total_num_kv_heads == 0
|
||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||
# MistralConfig has an optional head_dim introduced by Mistral-Nemo
|
||||
self.head_dim = getattr(config, "head_dim",
|
||||
self.hidden_size // self.total_num_heads)
|
||||
head_dim = getattr(config, "head_dim", None)
|
||||
if head_dim is None:
|
||||
head_dim = self.hidden_size // self.total_num_heads
|
||||
self.head_dim = head_dim
|
||||
# Phi models introduced a partial_rotary_factor parameter in the config
|
||||
self.partial_rotary_factor = getattr(config, "partial_rotary_factor",
|
||||
1)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user