# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
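
"""Compilation smoke tests for vLLM dynamic-shapes handling.

Each parametrization builds an LLM with a specific DynamicShapesType, runs a
short generation, and then has the model judge its own output through a
second decoding pass constrained to "yes"/"no" tokens.
"""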

import gc

import pytest
import torch

from vllm import LLM, SamplingParams
from vllm.config.compilation import CompilationMode, DynamicShapesType
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils.torch_utils import is_torch_equal_or_newer


def get_test_models():
    """Return the list of models to test."""
    # TODO: "Qwen/Qwen3-4B-Instruct-2507" fails; fix the issue and support it.
    return ["gpt2", "Qwen/Qwen2-7B-Instruct", "meta-llama/Llama-3.1-8B"]
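

# One test instance per (model, dynamic-shapes type, AOT-compile flag,
# bytecode-hook flag) combination; AOT compile is currently pinned to "0".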
@pytest.mark.parametrize("model_name", get_test_models())
|
|
@pytest.mark.parametrize(
|
|
"shapes_type",
|
|
[
|
|
DynamicShapesType.BACKED,
|
|
DynamicShapesType.UNBACKED,
|
|
DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
|
|
],
|
|
)
|
|
@pytest.mark.parametrize("use_aot_compile", ["0"])
|
|
@pytest.mark.parametrize("use_bytecode_hook", [True, False])
|
|
@pytest.mark.skipif(
|
|
not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
|
|
)
def test_dynamic_shapes_compilation(
    monkeypatch, model_name, shapes_type, use_aot_compile, use_bytecode_hook
):
    """Test that all dynamic shapes types compile successfully"""
    print(
        f"\nTesting model: {model_name} with {shapes_type.name}, "
        f"AOT compile: {use_aot_compile}, "
        f"Bytecode hook: {use_bytecode_hook}"
    )
    if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
        pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")

    prompt = "Hello, my name is"

    print(f"Testing {shapes_type.name} dynamic shapes...")

    # Initialize the model with the requested dynamic shapes configuration
    model = LLM(
        model=model_name,
        compilation_config={
            "mode": CompilationMode.VLLM_COMPILE,
            "dynamic_shapes_config": {
                "type": shapes_type.value,
            },
        },
    )
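
    # One unconstrained generation exercises the compiled model end to end.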
    output = model.generate(prompt)
    result = output[0].outputs[0].text

    # Constrain decoding to the "yes"/"no" token ids so the self-check below
    # is deterministic: greedy sampling, one token, restricted vocabulary.
    tokenizer = get_tokenizer(model_name)
    yes_tokens = tokenizer.encode("yes", add_special_tokens=False)
    no_tokens = tokenizer.encode("no", add_special_tokens=False)
    allowed_ids = list(set(yes_tokens + no_tokens))
    sampling_params = SamplingParams(
        max_tokens=1, temperature=0, allowed_token_ids=allowed_ids
    )

    output = model.generate(
        "answer with yes or no is " + result + " rubbish for prompt " + prompt + "?",
        sampling_params=sampling_params,
    )
    result = output[0].outputs[0].text
    assert result == "yes"

    # Clean up GPU memory
    del model
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    print("GPU memory cleared")
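

# Usage sketch (assumes standard pytest parametrize ids): run only the gpt2
# combinations with, e.g.:
#   pytest <path-to-this-file> -k gpt2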