diff --git a/requirements-dev.txt b/requirements-dev.txt
index 7a112771f539e..f770ccec90eef 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -10,3 +10,4 @@ types-setuptools
 
 # testing
 pytest
+pytest-forked
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000000000..92b06f3857f6c
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,132 @@
+from typing import List, Optional, Tuple
+
+import pytest
+import torch
+from transformers import AutoModelForCausalLM
+
+from vllm import LLM, SamplingParams
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+_TEST_PROMPTS = [
+    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+    "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+    "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    "Describe the basic components of a neural network and how it can be trained.",
+    "Write a short story about a robot that dreams for the first time.",
+    "Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.",
+    "Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.",
+    "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'",
+]
+
+
+@pytest.fixture
+def example_prompts() -> List[str]:
+    return _TEST_PROMPTS
+
+
+_STR_DTYPE_TO_TORCH_DTYPE = {
+    "half": torch.half,
+    "bfloat16": torch.bfloat16,
+    "float": torch.float,
+}
+
+
+class HfRunner:
+
+    def __init__(
+        self,
+        model_name: str,
+        tokenizer_name: Optional[str] = None,
+        dtype: str = "half",
+    ) -> None:
+        assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
+        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+        ).cuda()
+        if tokenizer_name is None:
+            tokenizer_name = model_name
+        self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)
+
+    def generate(
+        self,
+        prompts: List[str],
+        **kwargs,
+    ) -> List[Tuple[List[int], str]]:
+        outputs: List[Tuple[List[int], str]] = []
+        for prompt in prompts:
+            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+            output_ids = self.model.generate(
+                input_ids.cuda(),
+                use_cache=True,
+                **kwargs,
+            )
+            output_str = self.tokenizer.batch_decode(
+                output_ids,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            )[0]
+            output_ids = output_ids[0].cpu().tolist()
+            outputs.append((output_ids, output_str))
+        return outputs
+
+    def generate_greedy(
+        self,
+        prompts: List[str],
+        max_tokens: int,
+    ) -> List[Tuple[List[int], str]]:
+        return self.generate(prompts, do_sample=False,
+                             max_new_tokens=max_tokens)
+
+
+@pytest.fixture
+def hf_runner():
+    return HfRunner
+
+
+class VllmRunner:
+
+    def __init__(
+        self,
+        model_name: str,
+        tokenizer_name: Optional[str] = None,
+        dtype: str = "half",
+    ) -> None:
+        self.model = LLM(
+            model=model_name,
+            tokenizer=tokenizer_name,
+            trust_remote_code=True,
+            dtype=dtype,
+            swap_space=0,
+        )
+
+    def generate(
+        self,
+        prompts: List[str],
+        sampling_params: SamplingParams,
+    ) -> List[Tuple[List[int], str]]:
+        req_outputs = self.model.generate(
+            prompts, sampling_params=sampling_params)
+        outputs = []
+        for req_output in req_outputs:
+            prompt_str = req_output.prompt
+            prompt_ids = req_output.prompt_token_ids
+            output_str = req_output.outputs[0].text
+            output_ids = req_output.outputs[0].token_ids
+            outputs.append((prompt_ids + output_ids, prompt_str + output_str))
+        return outputs
+
+    def generate_greedy(
+        self,
+        prompts: List[str],
+        max_tokens: int,
+    ) -> List[Tuple[List[int], str]]:
+        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
+        return self.generate(prompts, greedy_params)
+
+
+@pytest.fixture
+def vllm_runner():
+    return VllmRunner
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
new file mode 100644
index 0000000000000..92949a8064147
--- /dev/null
+++ b/tests/models/test_models.py
@@ -0,0 +1,45 @@
+"""Compare the outputs of HF and vLLM when using greedy sampling.
+
+Run `pytest tests/models/test_models.py --forked`.
+"""
+import pytest
+
+MODELS = [
+    "facebook/opt-125m",
+    "gpt2",
+    "bigcode/tiny_starcoder_py",
+    "EleutherAI/gpt-j-6b",
+    "EleutherAI/pythia-70m",
+    "bigscience/bloom-560m",
+    "mosaicml/mpt-7b",
+    "tiiuae/falcon-7b",
+    "meta-llama/Llama-2-7b-hf",
+]
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model
+
+    vllm_model = vllm_runner(model, dtype=dtype)
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model
+
+    for i in range(len(example_prompts)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
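
Note for reviewers: `pytest-forked` runs each test in a forked subprocess, so every parametrized model gets a fresh process and its GPU memory is reclaimed when the test finishes; that is why the docstring says to pass `--forked`. As a quick way to exercise the new fixtures without the full HF-vs-vLLM comparison, here is a minimal sketch of a follow-up test. The test name, the opt-125m choice, and the 8-token budget are illustrative assumptions, not part of this diff; the assertion relies only on `VllmRunner.generate` returning `(prompt_ids + output_ids, prompt_str + output_str)` as defined in `tests/conftest.py`.

```python
# Hypothetical smoke test built on the fixtures added in tests/conftest.py.
# Run with `pytest --forked`; like the tests above, it needs a CUDA GPU.
import pytest

from vllm import SamplingParams


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_generate_prepends_prompt(vllm_runner, example_prompts,
                                  model: str) -> None:
    vllm_model = vllm_runner(model, dtype="half")
    params = SamplingParams(temperature=0.0, max_tokens=8)
    outputs = vllm_model.generate(example_prompts, params)
    # VllmRunner.generate concatenates prompt and completion, so each
    # returned string must start with its originating prompt.
    for prompt, (_, output_str) in zip(example_prompts, outputs):
        assert output_str.startswith(prompt)
```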