vllm/tests/models/test_transformers.py
Arthur a1a2aaadb9
[Model]: Add transformers backend support (#11330)
# Adds support for `transformers` as a backend

Following https://github.com/huggingface/transformers/pull/35235, a
bunch of models should already be supported, we are ramping up support
for more models.

Thanks @Isotr0py for the TP support, and @hmellor for his help as well!
This includes: 
- `trust_remote_code=True` support: any model on the hub, if it
implements attention the correct way can be natively supported!!
- tensor parallel support

---------

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-02-03 21:30:38 +08:00

76 lines
2.2 KiB
Python

"""Test the functionality of the Transformers backend.
Run `pytest tests/models/test_transformers.py`.
"""
from contextlib import nullcontext
from typing import Type
import pytest
from ..conftest import HfRunner, VllmRunner
from ..utils import multi_gpu_test
from .utils import check_logprobs_close
def check_implementation(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
example_prompts: list[str],
model: str,
**kwargs,
):
max_tokens = 32
num_logprobs = 5
with vllm_runner(model, **kwargs) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
with hf_runner(model) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize(
"model,model_impl",
[
("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
("openai-community/gpt2", "transformers"),
("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE
("meta-llama/Llama-3.2-1B-Instruct", "auto"),
]) # trust_remote_code=True by default
def test_models(hf_runner, vllm_runner, example_prompts, model,
model_impl) -> None:
maybe_raises = nullcontext()
if model == "openai-community/gpt2" and model_impl == "transformers":
# Model is not backend compatible
maybe_raises = pytest.raises(
ValueError,
match="The Transformers implementation.*not compatible with vLLM")
with maybe_raises:
check_implementation(hf_runner,
vllm_runner,
example_prompts,
model,
model_impl=model_impl)
@multi_gpu_test(num_gpus=2)
def test_distributed(
hf_runner,
vllm_runner,
example_prompts,
):
kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
check_implementation(hf_runner, vllm_runner, example_prompts,
"meta-llama/Llama-3.2-1B-Instruct", **kwargs)