[CI/Build] Reorganize models tests (#7820)

commit a84e598e21
parent 0a4806f0a9
@@ -23,12 +23,10 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker exec cpu-test bash -c "
  pip install pytest matplotlib einops transformers_stream_generator
  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
    --ignore=tests/models/test_oot_registration.py \
    --ignore=tests/models/test_registry.py \
    --ignore=tests/models/test_fp8.py \
    --ignore=tests/models/test_jamba.py \
    --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B are not supported on CPU
  pytest -v -s tests/models/decoder_only/language \
    --ignore=tests/models/test_fp8.py \
    --ignore=tests/models/decoder_only/language/test_jamba.py \
    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B are not supported on CPU

# Run compressed-tensor test
docker exec cpu-test bash -c "
@@ -94,7 +94,6 @@ steps:
  - pytest -v -s entrypoints/test_chat_utils.py
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@@ -164,15 +163,6 @@ steps:
  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference_encoder_decoder.py

- label: Models Test # 1hr10min
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s models -m \"not vlm\" --ignore=models/test_oot_registration.py

- label: torch compile integration test
  source_file_dependencies:
  - vllm/
@@ -180,14 +170,6 @@
  - pytest -v -s ./compile/test_full_graph.py
  - pytest -v -s ./compile/test_wrapper.py

- label: Vision Language Models Test # 42min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  commands:
  - pytest -v -s models -m vlm

- label: Prefix Caching Test # 7min
  #mirror_hardwares: [amd]
  source_file_dependencies:
@@ -286,6 +268,45 @@ steps:
  commands:
  - pytest -v -s tool_use

##### models test #####

- label: Basic Models Test # 3min
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/*.py --ignore=models/test_oot_registration.py

- label: Decoder-only Language Models Test # 1h3min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/language
  commands:
  - pytest -v -s models/decoder_only/language

- label: Decoder-only Multi-Modal Models Test # 56min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  commands:
  - pytest -v -s models/decoder_only/audio_language
  - pytest -v -s models/decoder_only/vision_language

- label: Other Models Test # 5min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  commands:
  - pytest -v -s models/embedding/language
  - pytest -v -s models/encoder_decoder/language

##### 1 GPU test #####
##### multi gpus test #####
@@ -311,11 +332,11 @@ steps:
  - tests/distributed/
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
  - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
  - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

- label: Distributed Tests (2 GPUs) # 28min
  #mirror_hardwares: [amd]
@@ -328,11 +349,10 @@ steps:
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
  - pytest -v -s distributed/test_basic_distributed_correctness_enc_dec.py
  - pytest -v -s distributed/test_chunked_prefill_distributed.py
  - pytest -v -s distributed/test_multimodal_broadcast.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
@@ -342,7 +342,7 @@ Note that, as an inference engine, vLLM does not introduce new models. Therefore

We have the following levels of testing for models:

1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `test_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_models.py>`_ and `test_big_models.py <https://github.com/vllm-project/vllm/blob/main/tests/models/test_big_models.py>`_ for the models that have passed this test.
1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests <https://github.com/vllm-project/vllm/blob/main/tests/models>`_ for the models that have passed this test.
2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests <https://github.com/vllm-project/vllm/tree/main/tests>`_ and `examples <https://github.com/vllm-project/vllm/tree/main/examples>`_ for the models that have passed this test.
4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
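For illustration only (not part of this patch), a "Strict Consistency" check in the reorganized layout looks roughly like the sketch below. The `hf_runner`/`vllm_runner` fixtures and the `check_outputs_equal` helper are the ones used throughout the test files in this diff; the model name and token budget are arbitrary assumptions.

```python
# Hedged sketch of a level-1 "Strict Consistency" test; not taken verbatim
# from the patch. Fixture and helper names mirror the test files in this diff.
import pytest

from ..models.utils import check_outputs_equal


@pytest.mark.parametrize("model", ["facebook/opt-125m"])  # example model
def test_greedy_matches_hf(hf_runner, vllm_runner, example_prompts, model):
    max_tokens = 32  # arbitrary small budget for a quick check

    # Run vLLM before HF so CUDA is not initialized in the parent process.
    with vllm_runner(model, dtype="half") as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype="half") as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    # Greedy decoding must agree token-for-token with HF Transformers.
    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
```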
@@ -85,5 +85,6 @@ skip_gitignore = true
[tool.pytest.ini_options]
markers = [
    "skip_global_cleanup",
    "vlm: run tests for vision language models only",
    "core_model: run this model test in each PR instead of just daily",
    "distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
]
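As a rough illustration (an assumption based on standard pytest marker behavior, not code from the patch), a test opts into one of the new markers with `pytest.mark`, and the CI commands above then select it with `-m`:

```python
# Hypothetical tests; the marker names come from the pyproject.toml entry
# above, but these functions do not exist in the patch.
import pytest


@pytest.mark.core_model
def test_small_model_every_pr():
    """Runs in each PR instead of only in the daily job."""


@pytest.mark.distributed_2_gpus
def test_two_gpu_path():
    """Selected by commands such as `pytest basic_correctness/ -m distributed_2_gpus`."""
```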
@@ -15,12 +15,15 @@ from vllm.utils import is_hip
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata

from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test

MODELS = [
    "facebook/opt-125m",
    "meta-llama/Llama-2-7b-hf",
]

TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
@@ -70,6 +73,65 @@ def test_models(
    )


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, "
    "test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
def test_models_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
) -> None:

    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
        # test ray adag
        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

    if attention_backend:
        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

    dtype = "half"
    max_tokens = 5

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model,
                     dtype=dtype,
                     tensor_parallel_size=2,
                     distributed_executor_backend=distributed_executor_backend
                     ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


def test_model_with_failure(vllm_runner) -> None:
    try:
        with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
@ -6,11 +6,13 @@ prefill requests are chunked.
|
||||
|
||||
Run `pytest tests/models/test_chunked_prefill.py`.
|
||||
"""
|
||||
import os
|
||||
from contextlib import nullcontext
|
||||
|
||||
import pytest
|
||||
|
||||
from ..models.utils import check_logprobs_close, check_outputs_equal
|
||||
from ..utils import multi_gpu_test
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
@ -66,6 +68,59 @@ def test_models(
|
||||
)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_models_distributed(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
if (model == "meta-llama/Llama-2-7b-hf"
|
||||
and distributed_executor_backend == "ray"):
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
chunked_prefill_token_size = 16
|
||||
|
||||
# Add a chunked prefill config.
|
||||
max_num_seqs = min(chunked_prefill_token_size, 256)
|
||||
assert chunked_prefill_token_size != -1
|
||||
enable_chunked_prefill = True
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=max_num_seqs,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype,model",
|
||||
[("fp8_e4m3",
|
||||
|
||||
@@ -19,10 +19,13 @@ MODELS = [
    "facebook/opt-125m",
]

assert ENABLE_ARTIFICIAL_PREEMPT is True, (
    "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
    "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
    "tests/basic_correctness/test_preemption.py`")

@pytest.fixture(scope="module", autouse=True)
def check_settings():
    assert ENABLE_ARTIFICIAL_PREEMPT is True, (
        "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
        "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
        "tests/basic_correctness/test_preemption.py`")


@pytest.fixture
@ -6,8 +6,8 @@ import sys
|
||||
import tempfile
|
||||
from collections import UserList
|
||||
from enum import Enum
|
||||
from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict,
|
||||
TypeVar, Union)
|
||||
from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
|
||||
TypedDict, TypeVar, Union)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
@ -18,6 +18,7 @@ from huggingface_hub import snapshot_download
|
||||
from PIL import Image
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
|
||||
BatchFeature)
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
@ -260,7 +261,7 @@ class HfRunner:
|
||||
*,
|
||||
model_kwargs: Optional[Dict[str, Any]] = None,
|
||||
is_embedding_model: bool = False,
|
||||
auto_cls=AutoModelForCausalLM,
|
||||
auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
|
||||
postprocess_inputs: Callable[[BatchEncoding],
|
||||
BatchEncoding] = identity,
|
||||
) -> None:
|
||||
@ -292,20 +293,14 @@ class HfRunner:
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
try:
|
||||
# don't put this import at the top level
|
||||
# it will call torch.cuda.device_count()
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
self.processor = AutoProcessor.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"Unable to auto-load HuggingFace processor for model (%s). "
|
||||
"Using tokenizer instead. Reason: %s", model_name, exc)
|
||||
self.processor = self.tokenizer
|
||||
# don't put this import at the top level
|
||||
# it will call torch.cuda.device_count()
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
self.processor = AutoProcessor.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
self.postprocess_inputs = postprocess_inputs
|
||||
|
||||
|
||||
@ -1,80 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
cd $VLLM_PATH/tests
|
||||
|
||||
pytest distributed/test_basic_distributed_correctness.py
|
||||
```
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize(
|
||||
"model, distributed_executor_backend, attention_backend, "
|
||||
"test_suite", [
|
||||
("facebook/opt-125m", "ray", "", "L4"),
|
||||
("facebook/opt-125m", "mp", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
|
||||
("facebook/opt-125m", "ray", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "", "A100"),
|
||||
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
|
||||
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
attention_backend: str,
|
||||
test_suite: str,
|
||||
) -> None:
|
||||
|
||||
if test_suite != TARGET_TEST_SUITE:
|
||||
pytest.skip(f"Skip test for {test_suite}")
|
||||
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
if attention_backend:
|
||||
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@ -1,102 +0,0 @@
|
||||
"""For encoder/decoder models only:
|
||||
Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
cd $VLLM_PATH/tests
|
||||
|
||||
pytest distributed/test_basic_distributed_correctness_enc_dec.py
|
||||
```
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..conftest import DecoderPromptType
|
||||
from ..models.utils import check_logprobs_close
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("model, distributed_executor_backend", [
|
||||
("facebook/bart-large-cnn", "ray"),
|
||||
("facebook/bart-large-cnn", "mp"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts,
|
||||
) -> None:
|
||||
'''
|
||||
Test vLLM BART inference on more than one GPU, comparing
|
||||
outputs against HF as a baseline.
|
||||
|
||||
Fork a new process for each test, to prevent CUDA from
|
||||
being re-initialized by successive tests within the same
|
||||
process.
|
||||
|
||||
Arguments:
|
||||
|
||||
* model: the HF ID of the specific BART variant under test
|
||||
* distributed_executor_backend
|
||||
* hf_runner: HuggingFace (HF) test model runner
|
||||
* vllm_runner: vLLM test model runner
|
||||
* example_encoder_decoder_prompts: test fixture which provides a
|
||||
dictionary of dummy prompts
|
||||
'''
|
||||
|
||||
dtype = "float"
|
||||
max_tokens = 64
|
||||
num_logprobs = 5
|
||||
|
||||
# Example inputs with non-trivial (i.e. not None/empty) encoder &
|
||||
# decoder prompts.
|
||||
test_prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
test_prompts, max_tokens, num_logprobs)
|
||||
|
||||
# Configuration settings for HF baseline
|
||||
hf_kwargs = {
|
||||
"top_k": None,
|
||||
"num_beams": 1,
|
||||
"repetition_penalty": 1.0,
|
||||
"top_p": 1.0,
|
||||
"length_penalty": 1.0,
|
||||
"early_stopping": False,
|
||||
"no_repeat_ngram_size": None,
|
||||
"min_length": 0
|
||||
}
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
||||
hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
test_prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
**hf_kwargs,
|
||||
))
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@ -1,75 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
pytest test_chunked_prefill_distributed.py
|
||||
```
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..models.utils import check_outputs_equal
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("model, distributed_executor_backend", [
|
||||
("facebook/opt-125m", "ray"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray"),
|
||||
("facebook/opt-125m", "mp"),
|
||||
("meta-llama/Llama-2-7b-hf", "mp"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray": # noqa
|
||||
assert distributed_executor_backend == "ray"
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
chunked_prefill_token_size = 16
|
||||
|
||||
# Add a chunked prefill config.
|
||||
max_num_seqs = min(chunked_prefill_token_size, 256)
|
||||
assert chunked_prefill_token_size != -1
|
||||
enable_chunked_prefill = True
|
||||
max_num_batched_tokens = chunked_prefill_token_size
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=max_num_seqs,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
max_num_batched_tokens=max_num_batched_tokens,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with hf_runner(model, dtype=dtype) as hf_model:
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@ -1,58 +0,0 @@
|
||||
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
|
||||
|
||||
Run:
|
||||
```sh
|
||||
pytest -s -v test_multimodal_broadcast.py
|
||||
```
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
from ..utils import fork_new_process_for_each_test
|
||||
|
||||
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize("model, distributed_executor_backend", [
|
||||
("llava-hf/llava-1.5-7b-hf", "ray"),
|
||||
("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
|
||||
("facebook/chameleon-7b", "ray"),
|
||||
("llava-hf/llava-1.5-7b-hf", "mp"),
|
||||
("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
|
||||
("facebook/chameleon-7b", "mp"),
|
||||
])
|
||||
@fork_new_process_for_each_test
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model: str,
|
||||
distributed_executor_backend: str) -> None:
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
num_logprobs = 5
|
||||
tensor_parallel_size = 2
|
||||
|
||||
if model.startswith("llava-hf/llava-1.5"):
|
||||
from ..models.test_llava import models, run_test
|
||||
elif model.startswith("llava-hf/llava-v1.6"):
|
||||
from ..models.test_llava_next import run_test # type: ignore[no-redef]
|
||||
from ..models.test_llava_next import models
|
||||
elif model.startswith("facebook/chameleon"):
|
||||
from ..models.test_chameleon import run_test # type: ignore[no-redef]
|
||||
from ..models.test_chameleon import models
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
# So that LLaVA-NeXT processor may return nested list
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@@ -1,13 +1,13 @@
import os

import torch
import torch.distributed as dist

from vllm.distributed.parallel_state import in_the_same_node_as

torch.distributed.init_process_group(backend="gloo")
test_result = all(
    in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0))
if __name__ == "__main__":
    dist.init_process_group(backend="gloo")
    test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))

expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!")
    expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
    assert test_result == expected, f"Expected {expected}, got {test_result}"
    print("Same node test passed!")
@ -10,7 +10,6 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
|
||||
from vllm.attention.backends.xformers import XFormersBackend
|
||||
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
|
||||
make_tensor_with_pad)
|
||||
|
||||
@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
|
||||
* Backend instance
|
||||
'''
|
||||
if backend_name == STR_XFORMERS_ATTN_VAL:
|
||||
# NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
|
||||
from vllm.attention.backends.xformers import XFormersBackend
|
||||
|
||||
return XFormersBackend()
|
||||
raise AssertionError(
|
||||
f"Unrecognized backend_name {backend_name} for unit test")
|
||||
|
||||
tests/models/decoder_only/__init__.py (new empty file)
@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ..conftest import HfRunner, VllmRunner
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import HfRunner, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_3"
|
||||
|
||||
tests/models/decoder_only/language/__init__.py (new empty file)
@ -7,7 +7,7 @@ Run `pytest tests/models/test_big_models.py`.
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from .utils import check_outputs_equal
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"meta-llama/Llama-2-7b-hf",
|
||||
@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from .utils import check_outputs_equal
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = ["h2oai/h2o-danube3-4b-base"]
|
||||
|
||||
@ -10,7 +10,7 @@ import pytest
|
||||
from tests.kernels.utils import override_backend_env_variable
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..models.utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@ -11,7 +11,7 @@ from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@ -15,7 +15,7 @@ import pytest
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
@ -10,9 +10,10 @@ from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelPair:
|
||||
@ -6,7 +6,7 @@ import importlib.metadata
|
||||
|
||||
import pytest
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
TRANSFORMERS_VERSION = tuple(
|
||||
map(int,
|
||||
@ -1,8 +1,9 @@
|
||||
import pytest
|
||||
|
||||
from tests.models.utils import check_outputs_equal
|
||||
from vllm.worker.model_runner import _get_graph_batch_size
|
||||
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = ["ai21labs/Jamba-tiny-random"]
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@ import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"mistralai/Mistral-7B-Instruct-v0.1",
|
||||
@ -7,7 +7,7 @@ Run `pytest tests/models/test_models.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from .utils import check_outputs_equal
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"facebook/opt-125m",
|
||||
@ -7,7 +7,7 @@ import torch
|
||||
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = [
|
||||
"microsoft/Phi-3.5-MoE-instruct",
|
||||
@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, AutoTokenizer
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@ -56,7 +54,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalData objects and corresponding
|
||||
MultiModalConfig as input.
|
||||
tests/models/decoder_only/vision_language/test_broadcast.py (new file, 42 lines)
@ -0,0 +1,42 @@
|
||||
import pytest
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", [
|
||||
"llava-hf/llava-1.5-7b-hf",
|
||||
"llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
"facebook/chameleon-7b",
|
||||
])
|
||||
def test_models(hf_runner, vllm_runner, image_assets,
|
||||
distributed_executor_backend, model) -> None:
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
num_logprobs = 5
|
||||
tensor_parallel_size = 2
|
||||
|
||||
if model.startswith("llava-hf/llava-1.5"):
|
||||
from .test_llava import models, run_test
|
||||
elif model.startswith("llava-hf/llava-v1.6"):
|
||||
from .test_llava_next import models, run_test # type: ignore[no-redef]
|
||||
elif model.startswith("facebook/chameleon"):
|
||||
from .test_chameleon import models, run_test # type: ignore[no-redef]
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
# So that LLaVA-NeXT processor may return nested list
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@ -6,10 +6,8 @@ from transformers import AutoModelForVision2Seq, BatchEncoding
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_outputs_equal
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_outputs_equal
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@ -36,7 +34,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding vision language config as input.
|
||||
@ -6,10 +6,8 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@ -46,7 +44,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@ -6,9 +6,7 @@ import torch.nn as nn
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
|
||||
|
||||
from ..conftest import _ImageAssets, cleanup
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import _ImageAssets, cleanup
|
||||
|
||||
# we use snapshot_download to prevent conflicts between
|
||||
# dynamic_module and trust_remote_code for hf_runner
|
||||
@ -9,11 +9,9 @@ from transformers import AutoConfig
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@ -78,7 +76,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@ -8,11 +8,9 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_LIMIT_IMAGE_PER_PROMPT = 4
|
||||
|
||||
@ -143,7 +141,7 @@ def _run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@ -239,7 +237,7 @@ def _run_test(
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||
dtype, max_tokens, num_logprobs) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
@ -5,10 +5,8 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@ -62,7 +60,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding vision language config as input.
|
||||
@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||
_ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_LIMIT_IMAGE_PER_PROMPT = 4
|
||||
|
||||
@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype, max_tokens, num_logprobs) -> None:
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
_PREFACE = (
|
||||
"A chat between a curious human and an artificial intelligence assistant. "
|
||||
@ -9,10 +9,8 @@ from transformers import BatchEncoding
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
# The image token is placed before "user" on purpose so that the test can pass
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
@ -65,7 +63,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_hip
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@ -69,7 +67,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.utils import is_cpu, is_hip
|
||||
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
@ -71,7 +69,7 @@ def run_test(
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
@ -5,7 +5,7 @@ Run `pytest tests/models/test_mistral.py`.
|
||||
import json
|
||||
import uuid
|
||||
from dataclasses import asdict
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
from mistral_common.protocol.instruct.messages import ImageURLChunk
|
||||
@ -17,9 +17,11 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
|
||||
from vllm.multimodal import MultiModalDataBuiltins
|
||||
from vllm.sequence import Logprob, SampleLogprobs
|
||||
|
||||
from .utils import check_logprobs_close
|
||||
from ....utils import VLLM_PATH
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
if TYPE_CHECKING:
|
||||
from _typeshed import StrPath
|
||||
|
||||
MODELS = ["mistralai/Pixtral-12B-2409"]
|
||||
IMG_URLS = [
|
||||
@ -83,14 +85,21 @@ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
|
||||
LIMIT_MM_PER_PROMPT = dict(image=4)
|
||||
|
||||
MAX_MODEL_LEN = [8192, 65536]
|
||||
FIXTURE_LOGPROBS_CHAT = "tests/models/fixtures/pixtral_chat.json"
|
||||
FIXTURE_LOGPROBS_ENGINE = "tests/models/fixtures/pixtral_chat_engine.json"
|
||||
|
||||
FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures"
|
||||
assert FIXTURES_PATH.exists()
|
||||
|
||||
FIXTURE_LOGPROBS_CHAT = FIXTURES_PATH / "pixtral_chat.json"
|
||||
FIXTURE_LOGPROBS_ENGINE = FIXTURES_PATH / "pixtral_chat_engine.json"
|
||||
|
||||
OutputsLogprobs = List[Tuple[List[int], str, Optional[SampleLogprobs]]]
|
||||
|
||||
|
||||
# For the test author to store golden output in JSON
|
||||
def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
|
||||
def _dump_outputs_w_logprobs(
|
||||
outputs: OutputsLogprobs,
|
||||
filename: "StrPath",
|
||||
) -> None:
|
||||
json_data = [(tokens, text,
|
||||
[{k: asdict(v)
|
||||
for k, v in token_logprobs.items()}
|
||||
@ -101,7 +110,7 @@ def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
|
||||
json.dump(json_data, f)
|
||||
|
||||
|
||||
def load_outputs_w_logprobs(filename: str) -> OutputsLogprobs:
|
||||
def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
with open(filename, "rb") as f:
|
||||
json_data = json.load(f)
|
||||
|
||||
@ -10,11 +10,9 @@ from vllm.inputs import InputContext, LLMInputs
|
||||
from vllm.multimodal.base import MultiModalInputs
|
||||
from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
|
||||
|
||||
from ..conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
|
||||
VllmRunner, _ImageAssets)
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
|
||||
VllmRunner, _ImageAssets)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
text_only_models = [
|
||||
"Qwen/Qwen-7B-Chat" # Has no visual component
|
||||
tests/models/embedding/__init__.py (new empty file)
tests/models/embedding/language/__init__.py (new empty file)
tests/models/encoder_decoder/__init__.py (new empty file)
tests/models/encoder_decoder/language/__init__.py (new empty file)
@ -1,8 +1,8 @@
|
||||
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
|
||||
|
||||
Run `pytest tests/models/test_bart.py`.
|
||||
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
|
||||
"""
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
@ -16,8 +16,10 @@ if not is_cpu():
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import DecoderPromptType
|
||||
from .utils import check_logprobs_close
|
||||
from ....conftest import (DecoderPromptType, ExplicitEncoderDecoderPrompt,
|
||||
HfRunner, VllmRunner)
|
||||
from ....utils import multi_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"]
|
||||
|
||||
@ -34,20 +36,18 @@ if not is_cpu():
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts,
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
decoder_prompt_type: DecoderPromptType,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
) -> None:
|
||||
'''
|
||||
Test the vLLM BART model for a variety of encoder/decoder input prompts,
|
||||
@ -116,8 +116,29 @@ if not is_cpu():
|
||||
token during the process of validating the vLLM decoded output.
|
||||
'''
|
||||
|
||||
test_case_prompts = example_encoder_decoder_prompts[
|
||||
decoder_prompt_type]
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default).
|
||||
|
||||
# Note: currently encoder/decoder models are only compatible with
|
||||
# enforce_eager=True. Normally this is not a problem because
|
||||
# for encoder/decoder models vLLM will
|
||||
# default to enforce_eager=True if enforce_eager
|
||||
# is left unspecified. However, the
|
||||
# VllmRunner test fixture (which wraps around the LLM class) defaults to
|
||||
# enforce_eager=False (a behavior which a number of already-existing
|
||||
# decoder-only unit tests expect), so when testing an encoder/decoder
|
||||
# model we must explicitly specify enforce_eager=True in the VllmRunner
|
||||
# constructor.
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs)
|
||||
|
||||
# Configuration settings for HF baseline
|
||||
hf_kwargs = {
|
||||
@ -135,26 +156,12 @@ if not is_cpu():
|
||||
auto_cls=AutoModelForSeq2SeqLM) as hf_model:
|
||||
hf_outputs = (
|
||||
hf_model.generate_encoder_decoder_greedy_logprobs_limit(
|
||||
test_case_prompts,
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
**hf_kwargs,
|
||||
))
|
||||
|
||||
# Note: currently encoder/decoder models are only compatible with
|
||||
# enforce_eager=True. Normally this is not a problem because
|
||||
# for encoder/decoder models vLLM will
|
||||
# default to enforce_eager=True if enforce_eager
|
||||
# is left unspecified. However, the
|
||||
# VllmRunner test fixture (which wraps around the LLM class) defaults to
|
||||
# enforce_eager=False (a behavior which a number of already-existing
|
||||
# decoder-only unit tests expect), so when testing an encoder/decoder
|
||||
# model we must explicitly specify enforce_eager=True in the VllmRunner
|
||||
# constructor.
|
||||
with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
|
||||
test_case_prompts, max_tokens, num_logprobs)
|
||||
|
||||
hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
|
||||
else 0)
|
||||
|
||||
@ -168,3 +175,49 @@ if not is_cpu():
|
||||
name_1="vllm",
|
||||
num_outputs_0_skip_tokens=hf_skip_tokens,
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float", "bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
|
||||
def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts,
|
||||
model, dtype, max_tokens, num_logprobs,
|
||||
decoder_prompt_type) -> None:
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts[decoder_prompt_type],
|
||||
decoder_prompt_type,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
|
||||
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [64])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("decoder_prompt_type", [DecoderPromptType.CUSTOM])
|
||||
def test_models_distributed(hf_runner, vllm_runner,
|
||||
example_encoder_decoder_prompts,
|
||||
distributed_executor_backend, model, dtype,
|
||||
max_tokens, num_logprobs,
|
||||
decoder_prompt_type) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_encoder_decoder_prompts[decoder_prompt_type],
|
||||
decoder_prompt_type,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=2,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
)
|
||||
@@ -10,6 +10,7 @@ from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

import openai
import pytest
import requests
from openai.types.completion import Completion
from transformers import AutoTokenizer
@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.model_executor.model_loader.loader import get_model_loader
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
from vllm.utils import (FlexibleArgumentParser, cuda_device_count_stateless,
                        get_open_port, is_hip)

if current_platform.is_rocm():
    from amdsmi import (amdsmi_get_gpu_vram_usage,
@@ -452,6 +454,22 @@ def fork_new_process_for_each_test(
    return wrapper


def multi_gpu_test(*, num_gpus: int):
    """
    Decorate a test to be run only when multiple GPUs are available.
    """
    test_selector = getattr(pytest.mark, f"distributed_{num_gpus}_gpus")
    test_skipif = pytest.mark.skipif(
        cuda_device_count_stateless() < num_gpus,
        reason=f"Need at least {num_gpus} GPUs to run the test.",
    )

    def wrapper(f: Callable[_P, None]) -> Callable[_P, None]:
        return test_selector(test_skipif(fork_new_process_for_each_test(f)))

    return wrapper
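For context, a sketch of a call site is shown below; the `@multi_gpu_test(num_gpus=2)` usage pattern appears in the test files earlier in this diff, while the import path and test body here are assumptions.

```python
# Hedged usage sketch: the decorator applies the distributed_{num_gpus}_gpus
# marker, skips the test when fewer GPUs are visible, and forks a fresh
# process per test via fork_new_process_for_each_test.
from tests.utils import multi_gpu_test  # import path is an assumption


@multi_gpu_test(num_gpus=2)
def test_models_tp2(vllm_runner, hf_runner, example_prompts):
    ...  # body would mirror test_models_distributed above
```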
async def completions_with_server_args(
    prompts: List[str],
    model_name: str,