[V0 Deprecation] Remove VLLM_USE_V1 from tests (#26341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-10-07 23:42:31 +08:00 committed by GitHub
parent c0a7b89d8e
commit 1e4ecca1d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
51 changed files with 817 additions and 1275 deletions

View File

@ -296,6 +296,7 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
- pytest -v -s v1/sample
@ -317,7 +318,7 @@ steps:
no_gpu: true
commands:
# split the test to avoid interference
- pytest -v -s v1/core
- pytest -v -s -m 'cpu_test' v1/core
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s -m 'cpu_test' v1/kv_connector/unit

View File

@ -13,7 +13,7 @@ import pytest
import torch
from vllm import LLM
from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
from vllm.v1.engine.llm_engine import LLMEngine
from ..conftest import HfRunner, VllmRunner
from ..models.utils import check_outputs_equal
@ -211,16 +211,11 @@ def test_models_distributed(
def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
from vllm.envs import VLLM_USE_V1
if not VLLM_USE_V1:
pytest.skip("Skipping V0 test, dump input not supported")
# Needed to mock an error in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
if isinstance(vllm_model.llm.llm_engine, LLMEngine):
v1_test_failed_model_execution(vllm_model)

View File

@ -117,68 +117,59 @@ def test_cumem_with_cudagraph():
@create_new_process_for_each_test()
@pytest.mark.parametrize(
"model, use_v1",
"model",
[
# sleep mode with safetensors
("meta-llama/Llama-3.2-1B", True),
"meta-llama/Llama-3.2-1B",
# sleep mode with pytorch checkpoint
("facebook/opt-125m", True),
"facebook/opt-125m",
],
)
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m:
assert use_v1
m.setenv("VLLM_USE_V1", "1")
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
def test_end_to_end(model: str):
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only
# test sleep level 1 here.
llm.sleep(level=1)
# the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
# which is difficult to measure in the test. therefore, we only
# test sleep level 1 here.
llm.sleep(level=1)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights)
free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
# now the memory usage is mostly cudagraph memory pool,
# and it should be less than the model weights (1B model, 2GiB weights)
# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be releasesd from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1.
if use_v1:
assert used_bytes < 7 * GiB_bytes
else:
assert used_bytes < 2 * GiB_bytes
# NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
# is captured but cannot be releasesd from PyTorch due to a known bug,
# therefore high memory usage after `llm.sleep` is called is expected.
# FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
# in V1.
assert used_bytes < 7 * GiB_bytes
llm.wake_up()
output2 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text
llm.wake_up()
output2 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output2[0].outputs[0].text
llm.sleep(level=1)
llm.wake_up(tags=["weights"])
llm.sleep(level=1)
llm.wake_up(tags=["weights"])
free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
# should just reallocate memory for weights (1B model, ~2GiB weights)
if use_v1:
assert used_bytes < 10 * GiB_bytes
else:
assert used_bytes < 6 * GiB_bytes
# should just reallocate memory for weights (1B model, ~2GiB weights)
assert used_bytes < 10 * GiB_bytes
# now allocate kv cache memory
llm.wake_up(tags=["kv_cache"])
output3 = llm.generate(prompt, sampling_params)
# now allocate kv cache memory
llm.wake_up(tags=["kv_cache"])
output3 = llm.generate(prompt, sampling_params)
# cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text
# cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text
@create_new_process_for_each_test()

View File

@ -66,7 +66,6 @@ def llm_pair(request):
pytest.skip("Only Blackwell GPUs support Cutlass MLA")
env_vars = {
"VLLM_USE_V1": "1",
# Force native sampler to avoid potential nondeterminism in FlashInfer
# when per-request generators are not used in V1.
"VLLM_USE_FLASHINFER_SAMPLER": "0",
@ -161,7 +160,6 @@ def test_full_cudagraph_with_invalid_backend():
with (
temporary_environ(
{
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION",
# Flex_Attention is not supported with full cuda graph
}

View File

@ -18,7 +18,6 @@ from vllm.config import (
VllmConfig,
set_current_vllm_config,
)
from vllm.envs import VLLM_USE_V1
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import is_torch_equal_or_newer
@ -127,7 +126,6 @@ def _run_simple_model(
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
def test_simple_piecewise_compile(use_inductor):
assert VLLM_USE_V1
_run_simple_model(
splitting_ops=["silly.attention"],
use_inductor_graph_partition=False,
@ -146,7 +144,6 @@ def test_simple_piecewise_compile(use_inductor):
@torch.inference_mode()
@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []])
def test_simple_inductor_graph_partition(splitting_ops):
assert VLLM_USE_V1
if not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

View File

@ -388,10 +388,6 @@ def test_async_tp_pass_correctness(
"pass_config": {"enable_async_tp": async_tp_enabled},
}
async_tp_env = tp_env = {
"VLLM_USE_V1": "1",
}
async_tp_args = [
*common_args,
"--tensor-parallel-size",
@ -410,6 +406,4 @@ def test_async_tp_pass_correctness(
"mp",
]
compare_two_settings(
model_id, async_tp_args, tp_args, async_tp_env, tp_env, method="generate"
)
compare_two_settings(model_id, async_tp_args, tp_args, method="generate")

View File

@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import vllm
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
from vllm.utils import _is_torch_equal_or_newer
@ -16,15 +15,10 @@ def test_version():
assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")
def test_use_cudagraphs_dynamic(monkeypatch):
assert vllm.envs.VLLM_USE_V1
def test_use_cudagraphs_dynamic():
vllm_config = VllmConfig()
assert vllm_config.compilation_config.use_cudagraph
monkeypatch.setenv("VLLM_USE_V1", "0")
vllm_config = VllmConfig()
assert not vllm_config.compilation_config.use_cudagraph
def test_custom_op():
# proper syntax
@ -41,8 +35,6 @@ def test_custom_op():
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
assert vllm.envs.VLLM_USE_V1
# Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)
@ -68,8 +60,6 @@ def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
@pytest.mark.forked
@pytest.mark.parametrize("enabled", [True, False])
def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
assert vllm.envs.VLLM_USE_V1
# Disable multiprocessing so that the counter is in the same process
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

View File

@ -303,7 +303,6 @@ def test_attention_quant_pattern(
model_class: type[AttentionQuantPatternModel],
backend: _Backend,
use_inductor_graph_partition: bool,
monkeypatch,
dist_init,
caplog_vllm,
):
@ -312,8 +311,6 @@ def test_attention_quant_pattern(
if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
pytest.skip("inductor graph partition is only available in PyTorch 2.9+")
monkeypatch.setenv("VLLM_USE_V1", "1")
device = torch.device("cuda:0")
torch.manual_seed(42)

View File

@ -8,16 +8,13 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.v1.engine.async_llm import AsyncLLM
def test_mp_reducer(monkeypatch):
def test_mp_reducer():
"""
Test that _reduce_config reducer is registered when AsyncLLM is instantiated
without transformers_modules. This is a regression test for
https://github.com/vllm-project/vllm/pull/18640.
"""
# Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
monkeypatch.setenv("VLLM_USE_V1", "1")
# Ensure transformers_modules is not in sys.modules
if "transformers_modules" in sys.modules:
del sys.modules["transformers_modules"]

View File

@ -5,7 +5,7 @@ from typing import Any, Optional
import pytest
from vllm import LLM, SamplingParams, envs
from vllm import LLM, SamplingParams
MODEL = "meta-llama/llama-2-7b-hf"
MAX_TOKENS = 200
@ -111,9 +111,7 @@ def _stop_token_id(llm):
@pytest.mark.skip_global_cleanup
def test_stop_strings():
# If V0, must set enforce_eager=False since we use
# async output processing below.
llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
llm = LLM(MODEL, enforce_eager=True)
_stop_basic(llm)
_stop_multi_tokens(llm)

View File

@ -42,24 +42,10 @@ class CPTestOptions(NamedTuple):
@dataclass
class CPTestSettings:
parallel_setups: list[ParallelSetup]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str]
runner: RunnerOption
test_options: CPTestOptions
def __post_init__(self):
if len(self.distributed_backends) != len(self.vllm_major_versions):
raise ValueError(
f"Length mismatch: distributed_backends "
f"({len(self.distributed_backends)}) != "
f"vllm_major_versions ({len(self.vllm_major_versions)})"
)
@staticmethod
def detailed(
*,
@ -87,7 +73,6 @@ class CPTestSettings:
return CPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp"],
vllm_major_versions=["1"],
runner=runner,
test_options=CPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
@ -98,14 +83,11 @@ class CPTestSettings:
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend, vllm_major_version in zip(
self.distributed_backends, self.vllm_major_versions
):
for backend in self.distributed_backends:
yield (
model_id,
parallel_setup,
backend,
vllm_major_version,
self.runner,
opts,
)
@ -115,7 +97,6 @@ def _compare_cp_with_tp(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available: int,
@ -191,10 +172,6 @@ def _compare_cp_with_tp(
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
cp_env = tp_env = {
"VLLM_USE_V1": vllm_major_version, # Note(hc): DCP only support V1 engine only
}
cp_args = [
*common_args,
"--tensor-parallel-size",
@ -217,24 +194,13 @@ def _compare_cp_with_tp(
distributed_backend,
]
try:
compare_two_settings(
model_id,
cp_args,
tp_args,
cp_env,
tp_env,
method=method,
max_wait_seconds=720,
)
except Exception:
testing_ray_compiled_graph = cp_env is not None
if testing_ray_compiled_graph and vllm_major_version == "0":
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger.exception("Ray Compiled Graph tests failed")
else:
raise
compare_two_settings(
model_id,
cp_args,
tp_args,
method=method,
max_wait_seconds=720,
)
CP_TEXT_GENERATION_MODELS = {
@ -257,7 +223,6 @@ CP_TEST_MODELS = [
"model_id",
"parallel_setup",
"distributed_backend",
"vllm_major_version",
"runner",
"test_options",
),
@ -274,7 +239,6 @@ def test_cp_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available,
@ -283,7 +247,6 @@ def test_cp_generation(
model_id,
parallel_setup,
distributed_backend,
vllm_major_version,
runner,
test_options,
num_gpus_available,

View File

@ -307,7 +307,6 @@ def _compare_tp(
if distributed_backend == "ray":
# For V1, test Ray Compiled Graph for all the tests
pp_env = {
"VLLM_USE_V1": "1",
"VLLM_USE_RAY_COMPILED_DAG": "1",
"VLLM_USE_RAY_SPMD_WORKER": "1",
"VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
@ -316,15 +315,11 @@ def _compare_tp(
# terminate because of a Ray Compiled Graph issue.
common_args.append("--disable-frontend-multiprocessing")
elif distributed_backend == "mp":
pp_env = {
"VLLM_USE_V1": "1",
}
pp_env = None
else:
pp_env = None
tp_env = {
"VLLM_USE_V1": "1",
}
tp_env = None
pp_args = [
*common_args,

View File

@ -42,24 +42,10 @@ class SPTestOptions(NamedTuple):
@dataclass
class SPTestSettings:
parallel_setups: list[ParallelSetup]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str]
runner: RunnerOption
test_options: SPTestOptions
def __post_init__(self):
if len(self.distributed_backends) != len(self.vllm_major_versions):
raise ValueError(
f"Length mismatch: distributed_backends "
f"({len(self.distributed_backends)}) != "
f"vllm_major_versions ({len(self.vllm_major_versions)})"
)
@staticmethod
def detailed(
*,
@ -85,7 +71,6 @@ class SPTestSettings:
return SPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
runner=runner,
test_options=SPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
@ -117,7 +102,6 @@ class SPTestSettings:
return SPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
runner=runner,
test_options=SPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
@ -147,7 +131,6 @@ class SPTestSettings:
return SPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
runner=runner,
test_options=SPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
@ -158,14 +141,11 @@ class SPTestSettings:
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend, vllm_major_version in zip(
self.distributed_backends, self.vllm_major_versions
):
for backend in self.distributed_backends:
yield (
model_id,
parallel_setup,
backend,
vllm_major_version,
self.runner,
opts,
)
@ -175,7 +155,6 @@ def _compare_sp(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: SPTestOptions,
num_gpus_available: int,
@ -265,10 +244,6 @@ def _compare_sp(
},
}
tp_sp_env = tp_env = {
"VLLM_USE_V1": vllm_major_version,
}
tp_sp_args = [
*common_args,
"--tensor-parallel-size",
@ -281,9 +256,6 @@ def _compare_sp(
json.dumps(compilation_config),
]
tp_env = {
"VLLM_USE_V1": vllm_major_version,
}
tp_args = [
*common_args,
"--tensor-parallel-size",
@ -292,18 +264,7 @@ def _compare_sp(
"mp",
]
try:
compare_two_settings(
model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method
)
except Exception:
testing_ray_compiled_graph = tp_sp_env is not None
if testing_ray_compiled_graph and vllm_major_version == "0":
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger.exception("Ray Compiled Graph tests failed")
else:
raise
compare_two_settings(model_id, tp_sp_args, tp_args, method=method)
SP_TEXT_GENERATION_MODELS = {
@ -325,7 +286,6 @@ SP_TEST_MODELS = [
"model_id",
"parallel_setup",
"distributed_backend",
"vllm_major_version",
"runner",
"test_options",
),
@ -341,7 +301,6 @@ def test_tp_sp_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: SPTestOptions,
num_gpus_available,
@ -350,7 +309,6 @@ def test_tp_sp_generation(
model_id,
parallel_setup,
distributed_backend,
vllm_major_version,
runner,
test_options,
num_gpus_available,

View File

@ -61,50 +61,34 @@ def run_test(model_name, more_args=None):
TPU_TP_TEST_STR = "" # "tensor_parallel_size=4"
@pytest.mark.skipif(
not current_platform.is_cuda() and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU",
)
@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch):
def test_lm_eval_accuracy_v1_engine(model):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = "max_model_len=2048,max_num_seqs=64"
more_args = "max_model_len=2048,max_num_seqs=64"
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
run_test(model, more_args)
run_test(model, more_args)
@pytest.mark.skipif(
not current_platform.is_cuda() and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU",
)
@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES)
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
model, monkeypatch: pytest.MonkeyPatch
):
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(model):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
more_args = None
if current_platform.is_tpu():
# Limit compilation time for TPU V1
more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
# Add TP test (if provided)
if TPU_TP_TEST_STR:
more_args += ",{}".format(TPU_TP_TEST_STR)
run_test(model, more_args)
run_test(model, more_args)

View File

@ -10,7 +10,6 @@ AsyncLLMEngine are working correctly.
"""
import lm_eval
import pytest
from vllm.platforms import current_platform
@ -67,21 +66,13 @@ def run_test(more_args):
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
@pytest.mark.skipif(
not current_platform.is_cuda()
and not current_platform.is_tpu()
and not current_platform.is_xpu(),
reason="V1 currently only supported on CUDA, XPU and TPU",
)
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
def test_lm_eval_accuracy_v1_engine():
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
more_args = []
more_args = []
# Limit compilation time for V1
if current_platform.is_tpu():
more_args = ["--max-num-seqs", "64"]
# Limit compilation time for V1
if current_platform.is_tpu():
more_args = ["--max-num-seqs", "64"]
run_test(more_args)
run_test(more_args)

View File

@ -21,18 +21,7 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@pytest.fixture(scope="module")
def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()
@pytest.fixture(scope="module")
def server(monkeypatch_module, zephyr_lora_files): # noqa: F811
monkeypatch_module.setenv("VLLM_USE_V1", "1")
def server(zephyr_lora_files): # noqa: F811
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",

View File

@ -37,21 +37,8 @@ BADREQUEST_CASES = [
]
@pytest.fixture(scope="module")
def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()
@pytest.fixture(scope="module", params=[True])
def server_with_lora_modules_json(request, monkeypatch_module, zephyr_lora_files):
use_v1 = request.param
assert use_v1
monkeypatch_module.setenv("VLLM_USE_V1", "1")
def server_with_lora_modules_json(request, zephyr_lora_files):
# Define the json format LoRA module configurations
lora_module_1 = {
"name": "zephyr-lora",

View File

@ -22,24 +22,6 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
PREV_MINOR_VERSION = version._prev_minor_version()
@pytest.fixture(scope="module", params=[True])
def use_v1(request):
# Module-scoped variant of run_with_both_engines
#
# Use this fixture to run a test with both v0 and v1, and
# also to conditionalize the test logic e.g.
#
# def test_metrics_exist(use_v1, server, client):
# ...
# expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
# for metric in expected:
# assert metric in response.text
#
# @skip_v1 wouldn't work here because this is a module-level
# fixture - per-function decorators would have no effect
yield request.param
@pytest.fixture(scope="module")
def default_server_args():
return [
@ -63,13 +45,11 @@ def default_server_args():
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
],
)
def server(use_v1, default_server_args, request):
def server(default_server_args, request):
if request.param:
default_server_args.append(request.param)
env_dict = dict(VLLM_USE_V1="1" if use_v1 else "0")
with RemoteOpenAIServer(
MODEL_NAME, default_server_args, env_dict=env_dict
) as remote_server:
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
@ -129,7 +109,8 @@ EXPECTED_VALUES = {
@pytest.mark.asyncio
async def test_metrics_counts(
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
server: RemoteOpenAIServer,
client: openai.AsyncClient,
):
for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged.
@ -145,7 +126,7 @@ async def test_metrics_counts(
# Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items():
if (use_v1 and metric_family not in EXPECTED_METRICS_V1) or (
if (metric_family not in EXPECTED_METRICS_V1) or (
not server.show_hidden_metrics
and metric_family in HIDDEN_DEPRECATED_METRICS
):
@ -183,62 +164,6 @@ async def test_metrics_counts(
assert found_metric, f"Did not find {metric_family} in prom endpoint"
EXPECTED_METRICS = [
"vllm:num_requests_running",
"vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
"vllm:e2e_request_latency_seconds_sum",
"vllm:e2e_request_latency_seconds_bucket",
"vllm:e2e_request_latency_seconds_count",
"vllm:request_queue_time_seconds_sum",
"vllm:request_queue_time_seconds_bucket",
"vllm:request_queue_time_seconds_count",
"vllm:request_inference_time_seconds_sum",
"vllm:request_inference_time_seconds_bucket",
"vllm:request_inference_time_seconds_count",
"vllm:request_prefill_time_seconds_sum",
"vllm:request_prefill_time_seconds_bucket",
"vllm:request_prefill_time_seconds_count",
"vllm:request_decode_time_seconds_sum",
"vllm:request_decode_time_seconds_bucket",
"vllm:request_decode_time_seconds_count",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
"vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count",
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:iteration_tokens_total",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:cache_config_info",
# labels in cache_config_info
"block_size",
"cache_dtype",
"cpu_offload_gb",
"enable_prefix_caching",
"gpu_memory_utilization",
"num_cpu_blocks",
"num_gpu_blocks",
"num_gpu_blocks_override",
"sliding_window",
"swap_space_bytes",
]
EXPECTED_METRICS_V1 = [
"vllm:num_requests_running",
"vllm:num_requests_waiting",
@ -304,17 +229,21 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
@pytest.mark.asyncio
async def test_metrics_exist(
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
server: RemoteOpenAIServer,
client: openai.AsyncClient,
):
# sending a request triggers the metrics to be logged.
await client.completions.create(
model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0
model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0,
)
response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK
for metric in EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS:
for metric in EXPECTED_METRICS_V1:
if metric in HIDDEN_DEPRECATED_METRICS and not server.show_hidden_metrics:
continue
assert metric in response.text
@ -322,10 +251,11 @@ async def test_metrics_exist(
@pytest.mark.asyncio
async def test_abort_metrics_reset(
server: RemoteOpenAIServer, client: openai.AsyncClient, use_v1: bool
server: RemoteOpenAIServer,
client: openai.AsyncClient,
):
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
server, use_v1
server
)
# Expect no running requests or kvcache usage
@ -351,7 +281,7 @@ async def test_abort_metrics_reset(
# Check that we have running requests
running_requests, waiting_requests, kv_cache_usage = _get_running_metrics_from_api(
server, use_v1
server
)
# Expect running requests and kvcache usage
@ -371,7 +301,7 @@ async def test_abort_metrics_reset(
# Verify running and waiting requests counts and KV cache usage are zero
running_requests_after, waiting_requests_after, kv_cache_usage_after = (
_get_running_metrics_from_api(server, use_v1)
_get_running_metrics_from_api(server)
)
assert running_requests_after == 0, (
@ -385,7 +315,7 @@ async def test_abort_metrics_reset(
)
def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
def _get_running_metrics_from_api(server: RemoteOpenAIServer):
"""Return (running_count, waiting_count, kv_cache_usage)"""
response = requests.get(server.url_for("metrics"))
@ -394,9 +324,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
# Verify running and waiting requests counts and KV cache usage are zero
running_requests, waiting_requests, kv_cache_usage = None, None, None
kv_cache_usage_metric = (
"vllm:kv_cache_usage_perc" if use_v1 else "vllm:gpu_cache_usage_perc"
)
kv_cache_usage_metric = "vllm:kv_cache_usage_perc"
for family in text_string_to_metric_families(response.text):
if family.name == "vllm:num_requests_running":
@ -422,7 +350,7 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
return running_requests, waiting_requests, kv_cache_usage
def test_metrics_exist_run_batch(use_v1: bool):
def test_metrics_exist_run_batch():
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
base_url = "0.0.0.0"
@ -452,7 +380,6 @@ def test_metrics_exist_run_batch(use_v1: bool):
"--port",
port,
],
env={"VLLM_USE_V1": "1"},
)
def is_server_up(url):

View File

@ -15,11 +15,6 @@ from vllm.entrypoints.renderer import BaseRenderer
from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch):
monkeypatch.setenv("VLLM_USE_V1", "1")
@pytest.mark.asyncio
async def test_empty_prompt():
model_name = "gpt2"

View File

@ -80,7 +80,6 @@ def test_env(
):
"""Test attention backend selection with valid device-backend pairs."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv(STR_BACKEND_ENV_VAR, name)
m.setenv("VLLM_MLA_DISABLE", "1" if use_mla else "0")
@ -212,30 +211,21 @@ def test_env(
@pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_fp32_fallback(
device: str,
monkeypatch: pytest.MonkeyPatch,
):
def test_fp32_fallback(device: str):
"""Test attention backend selection with fp32."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
if device == "cpu":
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "TORCH_SDPA"
if device == "cpu":
with patch("vllm.attention.selector.current_platform", CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "TORCH_SDPA"
elif device == "cuda":
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
elif device == "cuda":
with patch("vllm.attention.selector.current_platform", CudaPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "FLEX_ATTENTION"
def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""
# TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend
pytest.skip(
"Skipping as current backend selector does not "
"handle fallbacks when a backend is set via env var."
@ -289,7 +279,6 @@ def test_invalid_env(monkeypatch: pytest.MonkeyPatch):
monkeypatch.context() as m,
patch("vllm.attention.selector.current_platform", CudaPlatform()),
):
m.setenv("VLLM_USE_V1", "1")
m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
# Should raise ValueError for invalid backend

View File

@ -55,7 +55,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
set_seed(seed)
@ -72,7 +71,6 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with default backend
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
set_seed(seed)
with vllm_runner(
model_name,
@ -113,7 +111,6 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
with vllm_runner(
model_name,
@ -126,17 +123,18 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
flex_outputs = llm_flex.embed(prompts)
# Run with default backend
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner(
with (
monkeypatch.context() as m,
vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_default:
default_outputs = llm_default.embed(prompts)
) as llm_default,
):
default_outputs = llm_default.embed(prompts)
check_embeddings_close(
embeddings_0_lst=flex_outputs,

View File

@ -613,7 +613,6 @@ def test_dummy_maverick(
profile: bool = False,
) -> None:
# Disable multiprocessing allows us to access model executor from LLM engine
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
model_path = create_reduced_maverick_model(

View File

@ -8,7 +8,6 @@ if TYPE_CHECKING:
from vllm.config import VllmConfig
else:
VllmConfig = None
from vllm import envs
class DummyPlatform(Platform):
@ -19,10 +18,7 @@ class DummyPlatform(Platform):
@classmethod
def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
if envs.VLLM_USE_V1:
compilation_config = vllm_config.compilation_config
# Activate custom ops for v1.
compilation_config.custom_ops = ["all"]
vllm_config.compilation_config.custom_ops = ["all"]
def get_attn_backend_cls(
self,

View File

@ -16,7 +16,6 @@ class DummyV1Scheduler(Scheduler):
def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Explicitly turn off engine multiprocessing so
# that the scheduler runs in this process
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

View File

@ -8,18 +8,11 @@ Run `pytest tests/samplers/test_no_bad_words.py`.
from typing import Optional
import pytest
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
@pytest.fixture(autouse=True)
def v1(monkeypatch):
"""Only run on vLLM v1."""
monkeypatch.setenv("VLLM_USE_V1", "1")
def _generate(
llm: LLM,
prompt: str,

View File

@ -17,17 +17,6 @@ from vllm.lora.request import LoRARequest
# 100 training iterations with a training batch size of 100.
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch: pytest.MonkeyPatch):
"""
Since Multi-LoRA is only supported on the v1 TPU backend, set VLLM_USE_V1=1
for all tests in this file
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
yield
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
return vllm.LLM(
model="Qwen/Qwen2.5-3B-Instruct",

View File

@ -305,7 +305,6 @@ full_cg_backend_configs = {
"CutlassMLA": BackendConfig(
name="CutlassMLA",
env_vars={
"VLLM_USE_V1": "1",
"VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
"FORCE_NUM_KV_SPLITS": "1", # TODO: remove this when hang issue is fixed
},

View File

@ -1,11 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
pytestmark = pytest.mark.cpu_test
def new_kv_cache_spec():
return FullAttentionSpec(16, 1, 1, torch.float32, False)

View File

@ -1,14 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from vllm import LLM
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I"

View File

@ -60,7 +60,7 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte
):
pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack:
if not supported:
@ -117,7 +117,7 @@ combo_cases_2 = [
def test_cudagraph_compilation_combo(combo_case):
backend_name, cudagraph_mode, compilation_level, supported = combo_case
env_vars = {"VLLM_USE_V1": "1", **backend_configs[backend_name].env_vars}
env_vars = backend_configs[backend_name].env_vars
with temporary_environ(env_vars), ExitStack() as stack:
if not supported:

View File

@ -20,7 +20,6 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")

View File

@ -32,7 +32,7 @@ model_config = {
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("disable_hybrid_kv_cache_manager", [True, False])
def test_sliding_window_retrieval(
monkeypatch, model, batch_size, seed, disable_hybrid_kv_cache_manager
model, batch_size, seed, disable_hybrid_kv_cache_manager
):
"""
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
@ -40,39 +40,34 @@ def test_sliding_window_retrieval(
If we tell it upfront which we are going to be looking for, then
it answers correctly (mostly).
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
test_config = model_config[model]
test_config = model_config[model]
llm = LLM(
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
llm = LLM(
model=model, disable_hybrid_kv_cache_manager=disable_hybrid_kv_cache_manager
)
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
prompts, answer, indices = prep_prompts(batch_size, ln_range=test_config.ln_range)
prompts, answer, indices = prep_prompts(
batch_size, ln_range=test_config.ln_range
)
check_length(prompts, llm, test_config.sliding_window)
check_length(prompts, llm, test_config.sliding_window)
# Fresh generation
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
# Fresh generation
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
# Re-generate with the same prompts to test prefix caching
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
# Re-generate with the same prompts to test prefix caching
responses = llm.generate(prompts, sampling_params)
check_answers(
indices,
answer,
[response.outputs[0].text for response in responses],
accept_rate=1.0,
)
def check_length(prompts: list[str], llm: LLM, sliding_window: int):

View File

@ -81,8 +81,6 @@ def test_kv_sharing_fast_prefill(
)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Make scheduling deterministic for reproducibility
m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

View File

@ -13,7 +13,6 @@ Covers:
5) Multiple stop conditions
"""
import os
from typing import Optional, Union
import pytest
@ -161,9 +160,6 @@ MIN_TOKENS_TEST_CASES = [
@pytest.fixture(scope="module")
def llm_v1():
"""Create V1 LLM instance for testing"""
# Ensure V1 engine is used
os.environ["VLLM_USE_V1"] = "1"
llm = LLM(
model=TEST_MODEL,
tensor_parallel_size=1,
@ -503,6 +499,6 @@ if __name__ == "__main__":
Usage:
cd vllm/
VLLM_USE_V1=1 python -m pytest tests/v1/e2e/test_min_tokens.py -v
python -m pytest tests/v1/e2e/test_min_tokens.py -v
"""
pytest.main([__file__, "-v"])

View File

@ -301,7 +301,6 @@ def test_mtp_correctness(
model_setup: (method, model_name, tp_size)
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_MLA_DISABLE", "1")
method, model_name, tp_size = model_setup

View File

@ -95,17 +95,11 @@ async def generate(
)
@pytest.mark.asyncio
async def test_load(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs,
prompt: PromptType,
):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the
# tests.
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
@ -149,14 +143,11 @@ async def test_load(
)
@pytest.mark.asyncio
async def test_abort(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
engine_args: AsyncEngineArgs,
prompt: PromptType,
):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
@ -222,13 +213,8 @@ async def test_abort(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
)
@pytest.mark.asyncio
async def test_multi_abort(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
async def test_multi_abort(output_kind: RequestOutputKind):
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown)
@ -304,14 +290,11 @@ async def test_multi_abort(
)
@pytest.mark.asyncio
async def test_finished_flag(
monkeypatch: pytest.MonkeyPatch,
n: int,
engine_args: AsyncEngineArgs,
prompt: PromptType,
):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
@ -341,12 +324,10 @@ async def test_finished_flag(
)
@pytest.mark.asyncio
async def test_mid_stream_cancellation(
monkeypatch: pytest.MonkeyPatch, engine_args: AsyncEngineArgs, prompt: PromptType
engine_args: AsyncEngineArgs, prompt: PromptType
):
"""Test that requests can be cancelled mid-stream."""
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)
@ -411,9 +392,7 @@ async def test_customize_loggers(monkeypatch):
be added to the default loggers.
"""
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(
TEXT_ENGINE_ARGS,
@ -430,10 +409,8 @@ async def test_customize_loggers(monkeypatch):
@pytest.mark.asyncio(scope="module")
async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
async def test_dp_rank_argument():
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown)
@ -466,7 +443,7 @@ async def test_dp_rank_argument(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.asyncio
async def test_check_health(monkeypatch: pytest.MonkeyPatch):
async def test_check_health():
"""Test that check_health returns normally for healthy engine
and raises EngineDeadError when the engine is dead.
"""
@ -474,9 +451,7 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
from vllm.v1.engine.exceptions import EngineDeadError
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown)
@ -503,15 +478,10 @@ async def test_check_health(monkeypatch: pytest.MonkeyPatch):
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
)
@pytest.mark.asyncio
async def test_abort_final_output(
monkeypatch: pytest.MonkeyPatch,
output_kind: RequestOutputKind,
):
async def test_abort_final_output(output_kind: RequestOutputKind):
"""Test that abort() returns a final output with correct information."""
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
with ExitStack() as after:
with set_default_torch_num_threads(1):
engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
after.callback(engine.shutdown)

View File

@ -5,18 +5,11 @@ from argparse import ArgumentError
import pytest
from vllm import envs
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser
if not envs.VLLM_USE_V1:
pytest.skip(
"Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.",
allow_module_level=True,
)
def test_prefix_caching_from_cli():
parser = EngineArgs.add_cli_args(FlexibleArgumentParser())

View File

@ -46,188 +46,184 @@ def make_request() -> EngineCoreRequest:
@create_new_process_for_each_test()
def test_engine_core(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
"""Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
def test_engine_core():
"""Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
"""Test basic request lifecycle."""
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
"""Test basic request lifecycle."""
# First request.
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
# First request.
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 1
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 1
# Second request.
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 1
# Second request.
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 1
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
# Add two requests in a row.
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
assert len(engine_core.scheduler.waiting) == 2
assert len(engine_core.scheduler.running) == 2
# Add two requests in a row.
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
engine_core.add_request(*engine_core.preprocess_add_request(make_request()))
assert len(engine_core.scheduler.waiting) == 2
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 4
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 4
# Loop through until they are all done.
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
# Loop through until they are all done.
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
"""Test abort cycle."""
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
"""Test abort cycle."""
# Basic abort.
req = make_request()
request_id = req.request_id
# Basic abort.
req = make_request()
request_id = req.request_id
engine_core.add_request(*engine_core.preprocess_add_request(req))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
assert engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
engine_core.add_request(*engine_core.preprocess_add_request(req))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
assert engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 1
assert engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 1
assert engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
engine_core.abort_requests([request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
assert not engine_core.scheduler.has_unfinished_requests()
assert engine_core.scheduler.has_finished_requests()
engine_core.abort_requests([request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
assert not engine_core.scheduler.has_unfinished_requests()
assert engine_core.scheduler.has_finished_requests()
_ = engine_core.step()
assert not engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
_ = engine_core.step()
assert not engine_core.scheduler.has_unfinished_requests()
assert not engine_core.scheduler.has_finished_requests()
# Add, step, abort 1 of the 3.
req0 = make_request()
req1 = make_request()
req2 = make_request()
# Add, step, abort 1 of the 3.
req0 = make_request()
req1 = make_request()
req2 = make_request()
engine_core.add_request(*engine_core.preprocess_add_request(req0))
engine_core.add_request(*engine_core.preprocess_add_request(req1))
assert len(engine_core.scheduler.waiting) == 2
assert len(engine_core.scheduler.running) == 0
engine_core.add_request(*engine_core.preprocess_add_request(req0))
engine_core.add_request(*engine_core.preprocess_add_request(req1))
assert len(engine_core.scheduler.waiting) == 2
assert len(engine_core.scheduler.running) == 0
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
engine_core.add_request(*engine_core.preprocess_add_request(req2))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 2
engine_core.add_request(*engine_core.preprocess_add_request(req2))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 3
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 3
# Abort just one.
engine_core.abort_requests([req1.request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
# Abort just one.
engine_core.abort_requests([req1.request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
_ = engine_core.step()
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 2
# Abort the other requests at the same time.
engine_core.abort_requests([req2.request_id, req0.request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
# Abort the other requests at the same time.
engine_core.abort_requests([req2.request_id, req0.request_id])
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
# Sending duplicate requests with same request_id
req0 = make_request()
req1 = make_request()
req0.request_id = req1.request_id = "test"
engine_core.add_request(*engine_core.preprocess_add_request(req0))
# Sending duplicate requests with same request_id
req0 = make_request()
req1 = make_request()
req0.request_id = req1.request_id = "test"
engine_core.add_request(*engine_core.preprocess_add_request(req0))
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
engine_core.add_request(*engine_core.preprocess_add_request(req1))
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
engine_core.add_request(*engine_core.preprocess_add_request(req1))
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
@create_new_process_for_each_test()
def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
def test_engine_core_advanced_sampling():
"""
A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as top_p, min_tokens, and
presence_penalty, are set.
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
"""Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
"""Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
"""Test basic request lifecycle."""
# First request.
request: EngineCoreRequest = make_request()
request.sampling_params = SamplingParams(
min_tokens=4,
presence_penalty=1.0,
frequency_penalty=1.0,
repetition_penalty=0.1,
stop_token_ids=[1001, 1002],
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
engine_core.add_request(*engine_core.preprocess_add_request(request))
"""Test basic request lifecycle."""
# First request.
request: EngineCoreRequest = make_request()
request.sampling_params = SamplingParams(
min_tokens=4,
presence_penalty=1.0,
frequency_penalty=1.0,
repetition_penalty=0.1,
stop_token_ids=[1001, 1002],
)
engine_core.add_request(*engine_core.preprocess_add_request(request))
def _check_engine_state():
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
# Loop through until they are all done.
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
def _check_engine_state():
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
# Loop through until they are all done.
while (outs := engine_core.step()[0].get(0)) and outs.outputs:
pass
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
_check_engine_state()
_check_engine_state()
# Second request.
request2 = make_request()
request2.sampling_params = SamplingParams(
top_p=0.99,
top_k=50,
)
engine_core.add_request(*engine_core.preprocess_add_request(request2))
_check_engine_state()
# Second request.
request2 = make_request()
request2.sampling_params = SamplingParams(
top_p=0.99,
top_k=50,
)
engine_core.add_request(*engine_core.preprocess_add_request(request2))
_check_engine_state()
@create_new_process_for_each_test()
def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
def test_engine_core_concurrent_batches():
"""
Test that the engine can handle multiple concurrent batches.
"""
@ -272,173 +268,163 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
if hasattr(self, "thread_pool"):
self.thread_pool.shutdown(wait=False)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
engine_args = EngineArgs(
model=MODEL_NAME,
# To test concurrent batches.
max_num_seqs=2,
# Avoid all requests being scheduled once.
enable_prefix_caching=False,
max_num_batched_tokens=10,
# Reduce startup time.
enforce_eager=True,
engine_args = EngineArgs(
model=MODEL_NAME,
# To test concurrent batches.
max_num_seqs=2,
# Avoid all requests being scheduled once.
enable_prefix_caching=False,
max_num_batched_tokens=10,
# Reduce startup time.
enforce_eager=True,
)
vllm_config = engine_args.create_engine_config()
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
)
vllm_config = engine_args.create_engine_config()
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, log_stats=False, executor_class=DummyExecutor
assert engine_core.batch_queue is not None
# Add two requests in a row. Each request have 12 prompt tokens.
req0 = make_request_with_max_tokens("0", 5)
engine_core.add_request(*engine_core.preprocess_add_request(req0))
req1 = make_request_with_max_tokens("1", 5)
engine_core.add_request(*engine_core.preprocess_add_request(req1))
# Schedule Batch 1: (10, req0)
assert engine_core.step_with_batch_queue()[0] is None
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 10
# num_computed_tokens should have been updated immediately.
assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
# Schedule Batch 2: (2, req0), (8, req1)
assert engine_core.step_with_batch_queue()[0] == {}
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 2
assert scheduler_output.num_scheduled_tokens["1"] == 8
# num_computed_tokens should have been updated immediately.
assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
assert engine_core.scheduler.get_num_unfinished_requests() == 2
# Finish Batch 1 and schedule Batch 3: (4, req1).
# Note that req0 cannot be scheduled
# because it is in the decoding stage now.
engine_core.step_with_batch_queue()
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 4
# Finish Batch 2. Get first token of req0.
# Schedule Batch 4: (1, req0).
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 1
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 1
# Loop until req0 is finished.
req_id = 0
expected_num_tokens = [
engine_core.scheduler.requests["0"].num_tokens + 1,
engine_core.scheduler.requests["1"].num_tokens + 1,
]
while engine_core.scheduler.get_num_unfinished_requests() == 2:
output = engine_core.step_with_batch_queue()[0]
# Every step consumes an output.
assert output is not None
assert len(output[0].outputs) == 1
if req_id in engine_core.scheduler.requests:
assert (
engine_core.scheduler.requests[req_id].num_tokens
== expected_num_tokens[req_id]
)
assert engine_core.batch_queue is not None
# Add two requests in a row. Each request have 12 prompt tokens.
req0 = make_request_with_max_tokens("0", 5)
engine_core.add_request(*engine_core.preprocess_add_request(req0))
req1 = make_request_with_max_tokens("1", 5)
engine_core.add_request(*engine_core.preprocess_add_request(req1))
# Schedule Batch 1: (10, req0)
assert engine_core.step_with_batch_queue()[0] is None
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 10
# num_computed_tokens should have been updated immediately.
assert engine_core.scheduler.requests[req0.request_id].num_computed_tokens == 10
# Schedule Batch 2: (2, req0), (8, req1)
assert engine_core.step_with_batch_queue()[0] == {}
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 2
assert scheduler_output.num_scheduled_tokens["1"] == 8
# num_computed_tokens should have been updated immediately.
assert engine_core.scheduler.requests["0"].num_computed_tokens == 12
assert engine_core.scheduler.requests["1"].num_computed_tokens == 8
assert engine_core.scheduler.get_num_unfinished_requests() == 2
# Finish Batch 1 and schedule Batch 3: (4, req1).
# Note that req0 cannot be scheduled
# because it is in the decoding stage now.
engine_core.step_with_batch_queue()
assert len(engine_core.batch_queue) == 1
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 4
# Finish Batch 2. Get first token of req0.
# Schedule Batch 4: (1, req0).
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["0"] == 1
# Finish Batch 3. Get first token of req1. Schedule Batch 5: (1, req1).
output = engine_core.step_with_batch_queue()[0].get(0)
assert output is not None
assert len(output.outputs) == 1
assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13
scheduler_output = engine_core.batch_queue[-1][1]
assert scheduler_output.num_scheduled_tokens["1"] == 1
# Loop until req0 is finished.
req_id = 0
expected_num_tokens = [
engine_core.scheduler.requests["0"].num_tokens + 1,
engine_core.scheduler.requests["1"].num_tokens + 1,
]
while engine_core.scheduler.get_num_unfinished_requests() == 2:
output = engine_core.step_with_batch_queue()[0]
# Every step consumes an output.
assert output is not None
assert len(output[0].outputs) == 1
if req_id in engine_core.scheduler.requests:
assert (
engine_core.scheduler.requests[req_id].num_tokens
== expected_num_tokens[req_id]
)
expected_num_tokens[req_id] += 1
req_id = (req_id + 1) % 2
expected_num_tokens[req_id] += 1
req_id = (req_id + 1) % 2
@multi_gpu_test(num_gpus=2)
def test_engine_core_tp(monkeypatch: pytest.MonkeyPatch):
def test_engine_core_tp():
"""
Test engine can initialize worker in tp properly
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
"""Setup the EngineCore."""
engine_args = EngineArgs(
model=MODEL_NAME,
tensor_parallel_size=2,
# Reduce startup time.
enforce_eager=True,
)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
"""Setup the EngineCore."""
engine_args = EngineArgs(
model=MODEL_NAME,
tensor_parallel_size=2,
# Reduce startup time.
enforce_eager=True,
)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
def get_worker_cache_config_field(worker, key: str):
return getattr(worker.cache_config, key)
num_gpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_gpu_blocks",)
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
num_cpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_cpu_blocks",)
)
assert all(x is not None for x in num_gpu_blocks)
assert all(x is not None for x in num_cpu_blocks)
def get_worker_cache_config_field(worker, key: str):
return getattr(worker.cache_config, key)
num_gpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_gpu_blocks",)
)
num_cpu_blocks = engine_core.collective_rpc(
get_worker_cache_config_field, args=("num_cpu_blocks",)
)
assert all(x is not None for x in num_gpu_blocks)
assert all(x is not None for x in num_cpu_blocks)
@create_new_process_for_each_test()
def test_engine_core_invalid_request_id_type(monkeypatch: pytest.MonkeyPatch):
def test_engine_core_invalid_request_id_type():
"""Test that engine raises TypeError for non-string request_id."""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
with set_default_torch_num_threads(1):
engine_core = EngineCore(
vllm_config=vllm_config, executor_class=executor_class, log_stats=True
)
# Test with UUID object (common mistake)
uuid_request = make_request()
uuid_request.request_id = uuid.uuid4() # UUID object instead of string
# Test with UUID object (common mistake)
uuid_request = make_request()
uuid_request.request_id = uuid.uuid4() # UUID object instead of string
with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
with pytest.raises(TypeError, match="request_id must be a string, got.*UUID"):
engine_core.add_request(*engine_core.preprocess_add_request(uuid_request))
# Test with integer
int_request = make_request()
int_request.request_id = 12345
# Test with integer
int_request = make_request()
int_request.request_id = 12345
with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
engine_core.add_request(*engine_core.preprocess_add_request(int_request))
with pytest.raises(TypeError, match="request_id must be a string, got.*int"):
engine_core.add_request(*engine_core.preprocess_add_request(int_request))
# Test with None
none_request = make_request()
none_request.request_id = None
# Test with None
none_request = make_request()
none_request.request_id = None
with pytest.raises(TypeError, match="request_id must be a string, got.*NoneType"):
engine_core.add_request(*engine_core.preprocess_add_request(none_request))
with pytest.raises(
TypeError, match="request_id must be a string, got.*NoneType"
):
engine_core.add_request(*engine_core.preprocess_add_request(none_request))
# Verify engine is still functional after errors
valid_request = make_request()
engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
# Verify engine is still functional after errors
valid_request = make_request()
engine_core.add_request(*engine_core.preprocess_add_request(valid_request))
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0

View File

@ -130,8 +130,6 @@ def test_engine_core_client(
monkeypatch: pytest.MonkeyPatch, multiprocessing_mode: bool
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False)
@ -218,8 +216,6 @@ def test_engine_core_client(
@pytest.mark.asyncio(loop_scope="function")
async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch core engine utility function to test.
m.setattr(EngineCore, "echo", echo, raising=False)
@ -373,8 +369,6 @@ async def test_engine_core_client_util_method_custom_return(
monkeypatch: pytest.MonkeyPatch,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@ -422,8 +416,6 @@ async def test_engine_core_client_util_method_custom_dict_return(
monkeypatch: pytest.MonkeyPatch,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@ -480,8 +472,6 @@ async def test_engine_core_client_util_method_nested_structures(
monkeypatch: pytest.MonkeyPatch,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
# Must set insecure serialization to allow returning custom types.
m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
@ -592,76 +582,71 @@ async def test_engine_core_client_util_method_nested_structures(
indirect=["publisher_config"],
)
def test_kv_cache_events(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool,
publisher_config,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
block_size = 16
num_blocks = 2
block_size = 16
num_blocks = 2
engine_args = EngineArgs(
model=MODEL_NAME,
enforce_eager=True,
enable_prefix_caching=True,
block_size=block_size,
engine_args = EngineArgs(
model=MODEL_NAME,
enforce_eager=True,
enable_prefix_caching=True,
block_size=block_size,
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
engine_args.kv_events_config = publisher_config
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
)
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
try:
custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1)
request = make_request(sampling_params, custom_tokens)
client.add_request(request)
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
subscriber = MockSubscriber(
endpoint, topic=publisher_config.topic, decode_type=KVEventBatch
outputs: dict[str, list] = {request.request_id: []}
loop_until_done(client, outputs)
result = subscriber.receive_one(timeout=1000)
assert result is not None, "No message received"
seq, received = result
assert seq == 0, "Sequence number mismatch"
assert len(received.events) == 1, "We should have exactly one BlockStored event"
event = received.events[0]
assert isinstance(event, BlockStored), "We should have a BlockStored event"
assert len(event.block_hashes) == num_blocks, (
"We should have a BlockStored event with 2 block_hashes"
)
try:
custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1)
request = make_request(sampling_params, custom_tokens)
client.add_request(request)
outputs: dict[str, list] = {request.request_id: []}
loop_until_done(client, outputs)
result = subscriber.receive_one(timeout=1000)
assert result is not None, "No message received"
seq, received = result
assert seq == 0, "Sequence number mismatch"
assert len(received.events) == 1, (
"We should have exactly one BlockStored event"
)
event = received.events[0]
assert isinstance(event, BlockStored), "We should have a BlockStored event"
assert len(event.block_hashes) == num_blocks, (
"We should have a BlockStored event with 2 block_hashes"
)
assert event.block_size == block_size, (
"Block size should be the same as the block size"
)
assert event.parent_block_hash is None, "Parent block hash should be None"
assert event.lora_id is None, "Lora id should be None"
assert len(event.token_ids) == num_blocks * block_size, (
"Token ids should be the same as the custom tokens"
)
assert event.token_ids == custom_tokens, (
"Token ids should be the same as the custom tokens"
)
finally:
client.shutdown()
subscriber.close()
assert event.block_size == block_size, (
"Block size should be the same as the block size"
)
assert event.parent_block_hash is None, "Parent block hash should be None"
assert event.lora_id is None, "Lora id should be None"
assert len(event.token_ids) == num_blocks * block_size, (
"Token ids should be the same as the custom tokens"
)
assert event.token_ids == custom_tokens, (
"Token ids should be the same as the custom tokens"
)
finally:
client.shutdown()
subscriber.close()
@pytest.mark.asyncio
@ -672,101 +657,96 @@ def test_kv_cache_events(
)
@multi_gpu_test(num_gpus=4)
async def test_kv_cache_events_dp(
monkeypatch: pytest.MonkeyPatch,
multiprocessing_mode: bool,
publisher_config,
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
block_size = 16
num_blocks = 2
dp_size = 2
tp_size = 2
block_size = 16
num_blocks = 2
dp_size = 2
tp_size = 2
engine_args = EngineArgs(
model=MODEL_NAME,
enforce_eager=True,
enable_prefix_caching=True,
data_parallel_size=dp_size,
tensor_parallel_size=tp_size,
block_size=block_size,
engine_args = EngineArgs(
model=MODEL_NAME,
enforce_eager=True,
enable_prefix_caching=True,
data_parallel_size=dp_size,
tensor_parallel_size=tp_size,
block_size=block_size,
)
engine_args.kv_events_config = publisher_config
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
engine_args.kv_events_config = publisher_config
await asyncio.sleep(1)
vllm_config = engine_args.create_engine_config(UsageContext.UNKNOWN_CONTEXT)
# Build endpoints for all DP ranks
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
endpoints = []
for i in range(dp_size):
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
endpoints.append(offset_endpoint)
executor_class = Executor.get_class(vllm_config)
with set_default_torch_num_threads(1):
client = EngineCoreClient.make_client(
multiprocess_mode=multiprocessing_mode,
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
log_stats=False,
)
await asyncio.sleep(1)
subscriber = MockSubscriber(
endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
)
# Build endpoints for all DP ranks
base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1")
endpoints = []
for i in range(dp_size):
offset_endpoint = ZmqEventPublisher.offset_endpoint_port(base_endpoint, i)
endpoints.append(offset_endpoint)
try:
custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1)
all_request_ids = []
subscriber = MockSubscriber(
endpoints, topic=publisher_config.topic, decode_type=KVEventBatch
# Create and add 25 requests
# NOTE: attempts to force routing to both dp groups but can be flaky
for i in range(25):
await asyncio.sleep(0.01)
request = make_request(sampling_params, custom_tokens)
await client.add_request_async(request)
all_request_ids.append(request.request_id)
await asyncio.sleep(0.1)
# Initialize outputs dict for all requests
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
print("processing requests...")
await asyncio.wait_for(
loop_until_fully_done_async(client, outputs), timeout=20.0
)
try:
custom_tokens = list(range(num_blocks * block_size))
sampling_params = SamplingParams(max_tokens=1)
all_request_ids = []
# Receive from subscriber until no more messages
print("collecting results...")
results = []
while True:
result = subscriber.receive_one(timeout=1)
print(result)
if result is None:
break
results.append(result)
# Create and add 25 requests
# NOTE: attempts to force routing to both dp groups but can be flaky
for i in range(25):
await asyncio.sleep(0.01)
request = make_request(sampling_params, custom_tokens)
await client.add_request_async(request)
all_request_ids.append(request.request_id)
# Collect all events and data_parallel_ranks from all results
all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
unique_dps = set(all_dp_ranks)
assert len(unique_dps) == 2, (
f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
)
await asyncio.sleep(0.1)
# Initialize outputs dict for all requests
outputs: dict[str, list] = {req_id: [] for req_id in all_request_ids}
print("processing requests...")
await asyncio.wait_for(
loop_until_fully_done_async(client, outputs), timeout=20.0
)
# Receive from subscriber until no more messages
print("collecting results...")
results = []
while True:
result = subscriber.receive_one(timeout=1)
print(result)
if result is None:
break
results.append(result)
# Collect all events and data_parallel_ranks from all results
all_dp_ranks = [received.data_parallel_rank for (_, received) in results]
unique_dps = set(all_dp_ranks)
assert len(unique_dps) == 2, (
f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}"
)
finally:
client.shutdown()
subscriber.close()
finally:
client.shutdown()
subscriber.close()
@pytest.mark.timeout(20)
def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
m.setenv("VLLM_USE_V1", "1")
# Monkey-patch to extract core process pid while it's starting.
core_proc_pid = [None]
cepm_ctor = CoreEngineProcManager.__init__
@ -841,7 +821,6 @@ def test_engine_core_proc_instantiation_cuda_empty(monkeypatch: pytest.MonkeyPat
mock_executor_class.side_effect = create_mock_executor
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("CUDA_VISIBLE_DEVICES", "") # No CUDA devices
from vllm.v1.engine.utils import EngineZmqAddresses

View File

@ -21,12 +21,10 @@ DTYPE = "half"
def _vllm_model(
apc: bool,
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
*,
skip_tokenizer_init: bool = False,
):
"""Set up VllmRunner instance."""
monkeypatch.setenv("VLLM_USE_V1", "1")
return vllm_runner(
MODEL,
dtype=DTYPE,
@ -45,16 +43,16 @@ def _vllm_model(
# Prefix caching
params=[False, True],
)
def vllm_model(vllm_runner, request, monkeypatch):
def vllm_model(vllm_runner, request):
"""VllmRunner test fixture parameterized by APC True/False."""
with _vllm_model(request.param, vllm_runner, monkeypatch) as vllm_model:
with _vllm_model(request.param, vllm_runner) as vllm_model:
yield vllm_model
@pytest.fixture(scope="function")
def vllm_model_apc(vllm_runner, monkeypatch):
def vllm_model_apc(vllm_runner):
"""VllmRunner test fixture with APC."""
with _vllm_model(True, vllm_runner, monkeypatch) as vllm_model:
with _vllm_model(True, vllm_runner) as vllm_model:
yield vllm_model
@ -65,12 +63,11 @@ def vllm_model_apc(vllm_runner, monkeypatch):
# Prefix caching
params=[False, True],
)
def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
def vllm_model_skip_tokenizer_init(vllm_runner, request):
"""VllmRunner test fixture with APC."""
with _vllm_model(
request.param,
vllm_runner,
monkeypatch,
skip_tokenizer_init=True,
) as vllm_model:
yield vllm_model
@ -152,7 +149,7 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
)
def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
def test_engine_metrics(vllm_runner, example_prompts):
max_tokens = 100
# Use spec decoding to test num_accepted_tokens_per_pos
speculative_config = {
@ -161,7 +158,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
"prompt_lookup_min": 3,
"num_speculative_tokens": 5,
}
monkeypatch.setenv("VLLM_USE_V1", "1")
with vllm_runner(
MODEL,
speculative_config=speculative_config,
@ -216,8 +213,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"])
def test_skip_tokenizer_initialization(model: str, monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "1")
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.

View File

@ -103,7 +103,6 @@ def test_guided_decoding_deprecated():
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
)
def test_structured_output(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any],
unsupported_json_schema: dict[str, Any],
sample_sql_ebnf: str,
@ -115,8 +114,6 @@ def test_structured_output(
model_name: str,
speculative_config: dict[str, Any],
):
monkeypatch.setenv("VLLM_USE_V1", "1")
if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding")
@ -620,15 +617,12 @@ Make the response as short as possible.
],
)
def test_structured_output_with_reasoning_matrices(
monkeypatch: pytest.MonkeyPatch,
backend: str,
tokenizer_mode: TokenizerMode,
reasoning_parser: str,
model_name: str,
speculative_config: dict[str, Any] | None,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
if current_platform.is_tpu() and speculative_config:
pytest.skip("TPU does not support speculative decoding")
@ -691,13 +685,10 @@ def test_structured_output_with_reasoning_matrices(
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
def test_structured_output_auto_mode(
monkeypatch: pytest.MonkeyPatch,
unsupported_json_schema: dict[str, Any],
model_name: str,
tokenizer_mode: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
llm = LLM(
model=model_name,
max_model_len=1024,
@ -739,9 +730,7 @@ def test_structured_output_auto_mode(
@pytest.mark.skip_global_cleanup
def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
monkeypatch.setenv("VLLM_USE_V1", "1")
def test_guidance_no_additional_properties():
llm = LLM(
model="Qwen/Qwen2.5-1.5B-Instruct",
max_model_len=1024,
@ -801,12 +790,9 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
def test_structured_output_batched_with_non_structured_outputs_requests(
monkeypatch: pytest.MonkeyPatch,
sample_json_schema: dict[str, Any],
backend: str,
):
monkeypatch.setenv("VLLM_USE_V1", "1")
# Don't use eager execution on TPUs because we want to test for no
# recompilation at runtime
enforce_eager = bool(not current_platform.is_tpu())

View File

@ -53,7 +53,6 @@ cleanup() {
launch_baseline() {
BASELINE_BASE_CMD="source ${CONDA_PATH}/bin/activate ${CONDA_ENV_NAME};
VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \
@ -73,7 +72,6 @@ launch_pd() {
UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
PJRT_DEVICE=TPU \
@ -93,7 +91,6 @@ launch_pd() {
UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \

View File

@ -55,7 +55,6 @@ launch_pd() {
UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
VLLM_NIXL_SIDE_CHANNEL_HOST=${PREFILL_HOST} \
VLLM_NIXL_SIDE_CHANNEL_PORT=${PREFILL_NIXL_SIDE_PORT} \
PJRT_DEVICE=TPU \
@ -75,7 +74,6 @@ launch_pd() {
UCX_TLS=tcp \
VLLM_MULTIPROC_EXECUTE_MODEL_TIMEOUT_S=200 \
VLLM_LOGGING_LEVEL=DEBUG \
VLLM_USE_V1=1 \
PJRT_DEVICE=TPU \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_V1_MULTIPROCESSING=0 vllm serve $MODEL_NAME \

View File

@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
import ray
@ -10,15 +9,6 @@ from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
from vllm.v1.metrics.ray_wrappers import RayPrometheusMetric, RayPrometheusStatLogger
@pytest.fixture(scope="function", autouse=True)
def use_v1_only(monkeypatch):
"""
The change relies on V1 APIs, so set VLLM_USE_V1=1.
"""
monkeypatch.setenv("VLLM_USE_V1", "1")
MODELS = [
"distilbert/distilgpt2",
]
@ -39,10 +29,6 @@ def test_engine_log_metrics_ray(
@ray.remote(num_gpus=1)
class EngineTestActor:
async def run(self):
# Set environment variable inside the Ray actor since environment
# variables from pytest fixtures don't propagate to Ray actors
os.environ["VLLM_USE_V1"] = "1"
engine_args = AsyncEngineArgs(
model=model, dtype=dtype, disable_log_stats=False, enforce_eager=True
)

View File

@ -280,7 +280,6 @@ def test_get_logprobs_and_prompt_logprobs(
batch_logprobs_composition: BatchLogprobsComposition,
temperature: float,
example_prompts: list[str],
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Test V1 Engine logprobs & prompt logprobs
@ -308,220 +307,204 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter
example_prompts: example prompt fixture
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
if do_apc and (
temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT
):
# Skip some test-cases to save time.
pytest.skip()
test_prompts = example_prompts
do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching
if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT):
# Skip some test-cases to save time.
pytest.skip()
test_prompts = example_prompts
max_tokens = 5
hf_outputs = hf_model.generate_greedy(
test_prompts,
max_tokens = 5
hf_outputs = hf_model.generate_greedy(
test_prompts,
max_tokens=max_tokens,
)
hf_logprobs = hf_model.generate_greedy_logprobs(
test_prompts,
max_tokens=max_tokens,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(
max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984,
)
hf_logprobs = hf_model.generate_greedy_logprobs(
test_prompts,
for num_lp, num_plp in logprob_prompt_logprob_list
]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens,
do_apc=do_apc,
)
# Batch has mixed sample params
# (different logprobs/prompt logprobs combos)
logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
# Ensure that each test prompt has a logprob config for testing
logprob_prompt_logprob_list = _repeat_logprob_config(
test_prompts, logprob_prompt_logprob_list
)
# Generate SamplingParams
vllm_sampling_params = [
SamplingParams(
max_tokens=max_tokens,
logprobs=num_lp,
prompt_logprobs=num_plp,
temperature=temperature,
seed=1984,
)
for num_lp, num_plp in logprob_prompt_logprob_list
]
for _ in range(2 if do_apc else 1):
_run_and_validate(
vllm_model=vllm_model,
test_prompts=test_prompts,
vllm_sampling_params=vllm_sampling_params,
hf_logprobs=hf_logprobs,
hf_outputs=hf_outputs,
logprob_prompt_logprob_list=logprob_prompt_logprob_list,
temperature=temperature,
max_tokens=max_tokens,
do_apc=do_apc,
)
def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
def test_max_logprobs():
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
Should also fail for `prompt_logprobs > max_logprobs`
APC should not matter as this test checks basic request validation.
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
runner = VllmRunner(
"facebook/opt-125m",
max_logprobs=1,
enable_prefix_caching=False,
# 2 other llms alive during whole session
gpu_memory_utilization=0.15,
max_model_len=256,
)
vllm_sampling_params = SamplingParams(logprobs=1)
# should pass
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
runner = VllmRunner(
"facebook/opt-125m",
max_logprobs=1,
enable_prefix_caching=False,
# 2 other llms alive during whole session
gpu_memory_utilization=0.15,
max_model_len=256,
)
vllm_sampling_params = SamplingParams(logprobs=1)
# should pass
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
bad_sampling_params = SamplingParams(logprobs=2)
with pytest.raises(ValueError):
runner.generate(["Hello world"], sampling_params=bad_sampling_params)
bad_sampling_params = SamplingParams(logprobs=2)
with pytest.raises(ValueError):
runner.generate(["Hello world"], sampling_params=bad_sampling_params)
def test_none_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
def test_none_logprobs(vllm_model, example_prompts):
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
Args:
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5
max_tokens = 5
sampling_params_logprobs_none = SamplingParams(
max_tokens=max_tokens,
logprobs=None,
prompt_logprobs=None,
temperature=0.0,
)
results_logprobs_none = vllm_model.llm.generate(
example_prompts,
sampling_params=sampling_params_logprobs_none,
)
sampling_params_logprobs_none = SamplingParams(
max_tokens=max_tokens,
logprobs=None,
prompt_logprobs=None,
temperature=0.0,
)
results_logprobs_none = vllm_model.llm.generate(
example_prompts,
sampling_params=sampling_params_logprobs_none,
)
for i in range(len(results_logprobs_none)):
# Check sample logprobs are None
assert results_logprobs_none[i].outputs[0].logprobs is None
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
# Check prompt logprobs are None
assert results_logprobs_none[i].prompt_logprobs is None
for i in range(len(results_logprobs_none)):
# Check sample logprobs are None
assert results_logprobs_none[i].outputs[0].logprobs is None
assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
# Check prompt logprobs are None
assert results_logprobs_none[i].prompt_logprobs is None
def test_zero_logprobs(vllm_model, example_prompts, monkeypatch: pytest.MonkeyPatch):
def test_zero_logprobs(vllm_model, example_prompts):
"""Engine should return sampled token and prompt token logprobs
Args:
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
max_tokens = 5
max_tokens = 5
sampling_params_logprobs_zero = SamplingParams(
max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
)
results_logprobs_zero = vllm_model.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_zero
)
sampling_params_logprobs_zero = SamplingParams(
max_tokens=max_tokens, logprobs=0, prompt_logprobs=0, temperature=0.0
)
results_logprobs_zero = vllm_model.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_zero
)
for i in range(len(results_logprobs_zero)):
# Check that there is one sample logprob dict for each
# sample token
logprobs = results_logprobs_zero[i].outputs[0].logprobs
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
assert logprobs is not None
assert len(sampled_token_ids) == len(logprobs)
assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
# Check that there is one prompt logprob dict for each
# prompt token
assert prompt_logprobs is not None
assert len(prompt_token_ids) == len(prompt_logprobs)
for i in range(len(results_logprobs_zero)):
# Check that there is one sample logprob dict for each
# sample token
logprobs = results_logprobs_zero[i].outputs[0].logprobs
prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
assert logprobs is not None
assert len(sampled_token_ids) == len(logprobs)
assert results_logprobs_zero[i].outputs[0].cumulative_logprob is not None
# Check that there is one prompt logprob dict for each
# prompt token
assert prompt_logprobs is not None
assert len(prompt_token_ids) == len(prompt_logprobs)
def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
def test_all_logprobs(example_prompts):
"""Engine should return all vocabulary logprobs and prompt logprobs
Args:
example_prompts: list of example prompts (test fixture)
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
runner = VllmRunner(
"facebook/opt-125m",
max_logprobs=-1,
enable_prefix_caching=False,
# 2 other llms alive during whole session
gpu_memory_utilization=0.15,
max_model_len=256,
)
runner = VllmRunner(
"facebook/opt-125m",
max_logprobs=-1,
enable_prefix_caching=False,
# 2 other llms alive during whole session
gpu_memory_utilization=0.15,
max_model_len=256,
)
sampling_params_logprobs_all = SamplingParams(
max_tokens=5, logprobs=-1, prompt_logprobs=-1
)
results_logprobs_all = runner.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_all
)
vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
sampling_params_logprobs_all = SamplingParams(
max_tokens=5, logprobs=-1, prompt_logprobs=-1
)
results_logprobs_all = runner.llm.generate(
example_prompts, sampling_params=sampling_params_logprobs_all
)
vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
for i in range(len(results_logprobs_all)):
logprobs = results_logprobs_all[i].outputs[0].logprobs
prompt_logprobs = results_logprobs_all[i].prompt_logprobs
assert logprobs is not None
for logprob in logprobs:
assert len(logprob) == vocab_size
assert prompt_logprobs is not None
assert prompt_logprobs[0] is None
for prompt_logprob in prompt_logprobs[1:]:
assert len(prompt_logprob) == vocab_size
for i in range(len(results_logprobs_all)):
logprobs = results_logprobs_all[i].outputs[0].logprobs
prompt_logprobs = results_logprobs_all[i].prompt_logprobs
assert logprobs is not None
for logprob in logprobs:
assert len(logprob) == vocab_size
assert prompt_logprobs is not None
assert prompt_logprobs[0] is None
for prompt_logprob in prompt_logprobs[1:]:
assert len(prompt_logprob) == vocab_size
@pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode))
def test_logprobs_mode(logprobs_mode: LogprobsMode, monkeypatch: pytest.MonkeyPatch):
def test_logprobs_mode(logprobs_mode: LogprobsMode):
"""Test with LLM engine with different logprobs_mode.
For logprobs, we should have non-positive values.
For logits, we should expect at least one positive values.
"""
from vllm import LLM
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
llm = LLM(
"facebook/opt-125m",
max_logprobs=5,
enable_prefix_caching=False,
# 2 other llms alive during whole session
gpu_memory_utilization=0.05,
max_model_len=16,
logprobs_mode=logprobs_mode,
)
vllm_sampling_params = SamplingParams(logprobs=1)
results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)
llm = LLM(
"facebook/opt-125m",
max_logprobs=5,
enable_prefix_caching=False,
# 2 other llms alive during whole session
gpu_memory_utilization=0.05,
max_model_len=16,
logprobs_mode=logprobs_mode,
)
vllm_sampling_params = SamplingParams(logprobs=1)
results = llm.generate(["Hello world"], sampling_params=vllm_sampling_params)
total_token_with_logprobs = 0
positive_values = 0
for output in results[0].outputs:
for logprobs in output.logprobs:
for token_id in logprobs:
logprob = logprobs[token_id]
if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
assert logprob.logprob <= 0
if logprob.logprob > 0:
positive_values = positive_values + 1
total_token_with_logprobs = total_token_with_logprobs + 1
assert total_token_with_logprobs >= len(results[0].outputs)
if logprobs_mode in ("raw_logits", "processed_logits"):
assert positive_values > 0
del llm
total_token_with_logprobs = 0
positive_values = 0
for output in results[0].outputs:
for logprobs in output.logprobs:
for token_id in logprobs:
logprob = logprobs[token_id]
if logprobs_mode in ("raw_logprobs", "processed_logprobs"):
assert logprob.logprob <= 0
if logprob.logprob > 0:
positive_values = positive_values + 1
total_token_with_logprobs = total_token_with_logprobs + 1
assert total_token_with_logprobs >= len(results[0].outputs)
if logprobs_mode in ("raw_logits", "processed_logits"):
assert positive_values > 0
del llm

View File

@ -1,14 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from vllm import LLM, SamplingParams
if os.getenv("VLLM_USE_V1", "0") != "1":
pytest.skip("Test package requires V1", allow_module_level=True)
MODEL = "meta-llama/Llama-3.2-1B"
PROMPT = "Hello my name is Robert and I"
@ -173,14 +169,6 @@ def test_allowed_token_ids(llm):
_ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000]))
def test_priority(llm):
"""Check that we reject requests with priority."""
# Reject all allowed token ids
with pytest.raises(ValueError):
_ = llm.generate(PROMPT, priority=[1])
def test_seed(llm):
"""Check that seed impacts randomness."""

View File

@ -38,7 +38,6 @@ def test_eagle_max_len(
monkeypatch: pytest.MonkeyPatch, num_speculative_tokens: int, attn_backend: str
):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():

View File

@ -42,7 +42,6 @@ MAX_NUM_REQS = [16, 1024]
@pytest.mark.parametrize("max_num_seqs", MAX_NUM_REQS)
def test_basic(
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
model: str,
max_tokens: int,
tensor_parallel_size: int,
@ -55,23 +54,20 @@ def test_basic(
)
example_prompts = [prompt]
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner(
model,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens=1024,
max_model_len=8192,
gpu_memory_utilization=0.7,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
output = vllm_outputs[0][1]
with vllm_runner(
model,
# Note: max_num_batched_tokens == 1024 is needed here to
# actually test chunked prompt
max_num_batched_tokens=1024,
max_model_len=8192,
gpu_memory_utilization=0.7,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
output = vllm_outputs[0][1]
assert "1024" in output or "0, 1" in output
assert "1024" in output or "0, 1" in output
@pytest.mark.skip(reason="Temporarily disabled due to timeout")
@ -82,7 +78,6 @@ def test_basic(
@pytest.mark.parametrize("max_num_seqs", [16])
def test_phi3(
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
max_tokens: int,
max_num_seqs: int,
) -> None:
@ -99,18 +94,15 @@ def test_phi3(
# test head dim = 96
model = "microsoft/Phi-3-mini-128k-instruct"
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner(
model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
# vllm_outputs is a list of tuples whose first element is the token id
# and the second element is the output (including the prompt).
for output, answer in zip(vllm_outputs, answers):
generated_text = output[1]
assert answer in generated_text
with vllm_runner(
model, max_num_batched_tokens=256, max_num_seqs=max_num_seqs
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
# vllm_outputs is a list of tuples whose first element is the token id
# and the second element is the output (including the prompt).
for output, answer in zip(vllm_outputs, answers):
generated_text = output[1]
assert answer in generated_text
TP_SIZE_8 = 8
@ -123,7 +115,6 @@ TP_SIZE_8 = 8
)
def test_gemma3_27b_with_text_input_and_tp(
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
) -> None:
model = "google/gemma-3-27b-it"
max_tokens = 16
@ -140,21 +131,18 @@ def test_gemma3_27b_with_text_input_and_tp(
" but in rising every time we fall.",
]
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner(
model,
max_num_batched_tokens=256,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
# vllm_outputs is a list of tuples whose first element is the token id
# and the second element is the output (including the prompt).
for output, answer in zip(vllm_outputs, answers):
generated_text = output[1]
assert answer in generated_text
with vllm_runner(
model,
max_num_batched_tokens=256,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens)
# vllm_outputs is a list of tuples whose first element is the token id
# and the second element is the output (including the prompt).
for output, answer in zip(vllm_outputs, answers):
generated_text = output[1]
assert answer in generated_text
@pytest.mark.skipif(
@ -162,7 +150,6 @@ def test_gemma3_27b_with_text_input_and_tp(
)
def test_w8a8_quantization(
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
) -> None:
model = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
max_tokens = 5
@ -176,18 +163,15 @@ def test_w8a8_quantization(
)
example_prompts = [prompt]
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
with vllm_runner(
model,
max_num_batched_tokens=64,
max_model_len=4096,
gpu_memory_utilization=0.7,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
output = vllm_outputs[0][1]
with vllm_runner(
model,
max_num_batched_tokens=64,
max_model_len=4096,
gpu_memory_utilization=0.7,
max_num_seqs=max_num_seqs,
tensor_parallel_size=tensor_parallel_size,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
output = vllm_outputs[0][1]
assert "1024" in output or "0, 1" in output
assert "1024" in output or "0, 1" in output

View File

@ -86,7 +86,6 @@ GPU_UTIL = 0.9
@pytest.mark.parametrize("params", TEST_PARAMS)
def test_perf(
vllm_runner: type[VllmRunner],
monkeypatch: pytest.MonkeyPatch,
params: TestParams,
) -> None:
tokenizer = get_tokenizer(
@ -107,48 +106,45 @@ def test_perf(
)
)
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
sampling_params = SamplingParams(
max_tokens=params.decode_len, temperature=1.0, min_p=0.0
)
sampling_params = SamplingParams(
max_tokens=params.decode_len, temperature=1.0, min_p=0.0
)
with vllm_runner(
params.model,
max_num_batched_tokens=MAX_MODEL_LEN,
max_model_len=MAX_MODEL_LEN,
max_num_seqs=MAX_NUM_SEQS,
gpu_memory_utilization=GPU_UTIL,
enforce_eager=False,
tensor_parallel_size=1,
) as vllm_model:
print(" -- Warmup / Compile")
for i in range(NUM_WARMUPS):
_ = vllm_model.generate(prompts, sampling_params)
with vllm_runner(
params.model,
max_num_batched_tokens=MAX_MODEL_LEN,
max_model_len=MAX_MODEL_LEN,
max_num_seqs=MAX_NUM_SEQS,
gpu_memory_utilization=GPU_UTIL,
enforce_eager=False,
tensor_parallel_size=1,
) as vllm_model:
print(" -- Warmup / Compile")
for i in range(NUM_WARMUPS):
_ = vllm_model.generate(prompts, sampling_params)
print(" -- Benchmarking... ")
times = []
for i in range(NUM_RUNS):
start_time = time.time()
_ = vllm_model.generate(prompts, sampling_params)
times.append(time.time() - start_time)
print(" -- Benchmarking... ")
times = []
for i in range(NUM_RUNS):
start_time = time.time()
_ = vllm_model.generate(prompts, sampling_params)
times.append(time.time() - start_time)
avg_time = sum(times) / len(times)
avg_time = sum(times) / len(times)
print(" -- avg_time = {}".format(avg_time))
print(
" -- expected_avg_time = {} with err_tol = {}".format(
params.expected_avg_time, params.err_tol
)
print(" -- avg_time = {}".format(avg_time))
print(
" -- expected_avg_time = {} with err_tol = {}".format(
params.expected_avg_time, params.err_tol
)
)
diff = avg_time - params.expected_avg_time
ok = diff < params.err_tol
if diff < -params.err_tol:
print(
" !! WARNING !! Performance has improved by {}, "
"it may be necessary to fine-tune the "
"expected_avg_time = {}".format(-diff, params.expected_avg_time)
)
diff = avg_time - params.expected_avg_time
ok = diff < params.err_tol
if diff < -params.err_tol:
print(
" !! WARNING !! Performance has improved by {}, "
"it may be necessary to fine-tune the "
"expected_avg_time = {}".format(-diff, params.expected_avg_time)
)
assert ok, " !! ERROR !! Regression detected"
assert ok, " !! ERROR !! Regression detected"

View File

@ -82,7 +82,7 @@ def test_traces(
):
with monkeypatch.context() as m:
m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
m.setenv("VLLM_USE_V1", "1")
sampling_params = SamplingParams(
temperature=0.01,
top_p=0.1,

View File

@ -77,7 +77,13 @@ class CPUModelRunner(GPUModelRunner):
logger.info("Warming up model for the compilation...")
# Only generate graph for the generic shape
with _set_global_compilation_settings(self.vllm_config):
self._dummy_run(max(16, self.max_num_reqs))
self._dummy_run(
min(
max(16, self.max_num_reqs),
self.scheduler_config.max_num_batched_tokens,
)
)
logger.info("Warming up done.")
def _init_device_properties(self) -> None: