# vllm/tests/v1/logits_processors/test_custom_offline.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import random
import sys
from typing import Any, Union

import pytest

from tests.utils import create_new_process_for_each_test
from tests.v1.logits_processors.utils import (
DUMMY_LOGITPROC_ARG,
DUMMY_LOGITPROC_FQCN,
DUMMY_LOGITPROC_MODULE,
MAX_TOKENS,
MODEL_NAME,
POOLING_MODEL_NAME,
TEMP_GREEDY,
CustomLogitprocSource,
DummyLogitsProcessor,
WrappedPerReqLogitsProcessor,
dummy_module,
prompts,
)
from tests.v1.logits_processors.utils import entry_points as fake_entry_points
from vllm import LLM, SamplingParams
from vllm.v1.sample.logits_processor import (
STR_POOLING_REJECTS_LOGITSPROCS,
STR_SPEC_DEC_REJECTS_LOGITSPROCS,
LogitsProcessor,
)

# Create a mixture of requests which do and don't utilize the dummy logitproc
sampling_params_list = [
SamplingParams(
temperature=TEMP_GREEDY,
max_tokens=MAX_TOKENS,
extra_args={DUMMY_LOGITPROC_ARG: 128},
),
SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS),
SamplingParams(
temperature=TEMP_GREEDY,
max_tokens=MAX_TOKENS,
extra_args={DUMMY_LOGITPROC_ARG: 67},
),
SamplingParams(temperature=TEMP_GREEDY, max_tokens=MAX_TOKENS),
]
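
# For reference, the dummy logitproc's behavior reduces to the following
# sketch (a simplified illustration, not the actual implementation in
# tests/v1/logits_processors/utils.py): for each request that supplies
# DUMMY_LOGITPROC_ARG, every logit except the target token is masked,
#
#     logits[:] = float("-inf")
#     logits[target_token] = 0.0
#
# so greedy decoding must emit `target_token` at every step.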


def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
"""Compare `LLM` instance initialized with specified `kwargs` against
reference `LLM` instance.
Two scenarios:
1. Server has loaded dummy logitproc; test that requests which specify
dummy logitproc arg value behave as if logitproc is operating (output
token value should repeat), while requests that don't specify dummy
logitproc arg value should match reference `LLM` output.
2. Server has *not* loaded dummy logitproc; test that all requests
behave as if logitproc is *not* operating (output matches reference
`LLM` output.)
Args:
kwargs: `LLM` constructor kwargs
logitproc_loaded: server has loaded dummy logitproc if True
"""
# Create a vLLM instance and load custom logitproc
llm_logitproc = LLM(
model=MODEL_NAME,
gpu_memory_utilization=0.1,
**kwargs,
)
# Create a reference vLLM instance without custom logitproc
llm_ref = LLM(model=MODEL_NAME, gpu_memory_utilization=0.1)
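    # Both instances see identical prompts and greedy sampling params, so
    # any divergence in generated tokens is attributable to the logitproc.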
# Run inference with logitproc loaded
outputs_logitproc = llm_logitproc.generate(prompts, sampling_params_list)
# Reference run
outputs_ref = llm_ref.generate(prompts, sampling_params_list)
# Validate outputs
for bdx, (out_lp, out_ref, params) in enumerate(
zip(outputs_logitproc, outputs_ref, sampling_params_list)
):
lp_toks = out_lp.outputs[0].token_ids
if logitproc_loaded and params.extra_args:
# This request exercises custom logitproc; validate that logitproc
# forces `target_token` to be decoded in each step
target_token = params.extra_args[DUMMY_LOGITPROC_ARG]
if not all(x == target_token for x in lp_toks):
raise AssertionError(
f"Request {bdx} generated {lp_toks}, should all be {target_token}"
)
else:
# This request does not exercise custom logitproc (or custom
# logitproc is not enabled on this server); validate against
# reference result
ref_toks = out_ref.outputs[0].token_ids
if lp_toks != ref_toks:
raise AssertionError(
f"Request {bdx} generated {lp_toks}, should match {ref_toks}"
)
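

# The offline API under test reduces to the following construction patterns
# (illustrative, mirroring the scenarios exercised below):
#
#     LLM(model=..., logits_processors=[DummyLogitsProcessor])  # class object
#     LLM(model=..., logits_processors=[DUMMY_LOGITPROC_FQCN])  # FQCN string
#
# plus entrypoint discovery, which requires no constructor argument at all.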


@create_new_process_for_each_test()
@pytest.mark.parametrize("logitproc_source", list(CustomLogitprocSource))
def test_custom_logitsprocs(monkeypatch, logitproc_source: CustomLogitprocSource):
"""Test offline Python interface for passing custom logitsprocs
Construct an `LLM` instance which loads a custom logitproc that has a
well-defined behavior (mask out all tokens except one `target_token`)
Construct a reference `LLM` instance with no custom logitproc
Pass in a batch of requests, 50% of which pass a `target_token` value
in through `SamplingParams.extra_args`, 50% of which do not.
Validate that
* Requests which do not activate the custom logitproc, yield the same
results for both `LLM` instances
* Requests which activate the custom logitproc, only output `target_token`
Test four scenarios, corresponding to `logitproc_source` value
* No logitsprocs loaded - test that generated tokens match reference `LLM`
instance output
* Logitproc passed in via {entrypoint, class object, fully-qualified class
name (FQCN)} - test that dummy logitproc is utilized correctly when
provided via any of these three possible sources
Args:
monkeypatch: for setting env vars
logitproc_source: what source (entrypoint, fully-qualified class name
(FQCN), class object, or None) the user pulls the
logitproc from
"""
# Test that logitproc info is passed to workers
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1")
random.seed(40)
# Choose LLM args based on logitproc source
if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_NONE:
# Scenario: the server does not load any custom logitproc
# Every other scenario is a different way of loading a custom logitproc
_run_test({}, logitproc_loaded=False)
return
if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT:
# Scenario: vLLM loads a logitproc from a preconfigured entrypoint
# To that end, mock a dummy logitproc entrypoint
import importlib.metadata
importlib.metadata.entry_points = fake_entry_points # type: ignore
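        # In a real deployment the logitproc would be exposed via a package
        # entry point rather than this monkeypatch. An illustrative
        # pyproject.toml snippet (group and module names here are
        # assumptions, not taken from this test):
        #
        #     [project.entry-points."vllm.logits_processors"]
        #     dummy = "my_module:DummyLogitsProcessor"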
# fork is required for workers to see entrypoint patch
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
_run_test({}, logitproc_loaded=True)
return
kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}
if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
# Scenario: load logitproc based on fully-qualified class name (FQCN)
# Inject dummy module which defines logitproc
sys.modules[DUMMY_LOGITPROC_MODULE] = dummy_module
kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS:
# Scenario: load logitproc from provided class object
kwargs["logits_processors"] = [DummyLogitsProcessor]
_run_test(kwargs, logitproc_loaded=True)


@create_new_process_for_each_test()
def test_custom_logitsprocs_req(monkeypatch):
"""Test passing request-level logits processor to offline Python interface
Wrap a request-level logits processor to create a batch level logits
processor that has a well-defined behavior (mask out all tokens except one
`target_token`)
Construct an `LLM` instance which loads the wrapped logits processor. Pass
the custom logitproc as a class object.
Construct a reference `LLM` instance with no custom logitproc
Pass in a batch of requests, 50% of which pass a `target_token` value
in through `SamplingParams.extra_args`, 50% of which do not.
Validate that
* Requests which do not activate the custom logitproc, yield the same
results for both `LLM` instances
* Requests which activate the custom logitproc, only output `target_token`
Args:
monkeypatch: for setting env vars
"""
# Test that logitproc info is passed to workers
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1")
random.seed(40)
_run_test(
{"logits_processors": [WrappedPerReqLogitsProcessor]}, logitproc_loaded=True
)


@create_new_process_for_each_test()
@pytest.mark.parametrize("model_scenario", ["pooling", "spec_dec"])
@pytest.mark.parametrize(
"logitproc_source",
[
CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT,
CustomLogitprocSource.LOGITPROC_SOURCE_FQCN,
CustomLogitprocSource.LOGITPROC_SOURCE_CLASS,
],
)
def test_rejects_custom_logitsprocs(
monkeypatch, model_scenario: str, logitproc_source: CustomLogitprocSource
):
"""Validate that vLLM engine initialization properly rejects custom
logitsprocs when the model is a pooling model or speculative decoding
enabled.
Use `LLM` entrypoint. We expect `LLM` initialization to fail before the
logitproc is actually loaded.
Scenario 1:
* Mock a logitproc entrypoint
* Validate that `LLM` does not load the logitproc
Scenario 2:
* Pass custom logitproc to `LLM` constructor
* Scenario 2a: via FQCN
* Scenario 2b: via class object
* Validate that initialization fails with appropriate exception
Args:
monkeypatch: used to set environment variables
logitproc_source: what source (entrypoint, fully-qualified class name
(FQCN), or class object) the user pulls the
logitproc from
"""
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
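    # Multiprocessing is disabled so that, in the entrypoint scenario below,
    # the driver worker is accessible in-process for inspection.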
random.seed(40)
test_params: dict[str, dict[str, Any]] = {
"pooling": {
"runner": "pooling",
"model": POOLING_MODEL_NAME,
"error_message": STR_POOLING_REJECTS_LOGITSPROCS,
"speculative_config": None,
},
"spec_dec": {
"runner": "auto",
"model": MODEL_NAME,
"error_message": STR_SPEC_DEC_REJECTS_LOGITSPROCS,
"speculative_config": {"model": "ngram", "num_speculative_tokens": 1},
},
}
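    # The "spec_dec" scenario uses a minimal ngram speculative config purely
    # to switch speculative decoding on; the rejection path under test only
    # requires that speculative decoding is enabled.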
config = test_params[model_scenario]
llm_kwargs: dict[str, Any] = {
"runner": config["runner"],
"model": config["model"],
"gpu_memory_utilization": 0.1,
"speculative_config": config["speculative_config"],
}
if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT:
# Scenario: vLLM loads a model and ignores a logitproc that is
# available at a preconfigured entrypoint
# Patch in dummy logitproc entrypoint
import importlib.metadata
importlib.metadata.entry_points = fake_entry_points # type: ignore
# fork is required for entrypoint patch to be visible to workers,
# although they should ignore the entrypoint patch anyway
monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
llm = LLM(**llm_kwargs)
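        # Unlike the FQCN and class-object sources below, an
        # entrypoint-discovered logitproc does not fail initialization;
        # it is simply skipped, so we assert that none were loaded.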
# Require that no logitsprocs have been loaded
worker = llm.llm_engine.model_executor.driver_worker.worker
        assert sum(1 for _ in worker.model_runner.input_batch.logitsprocs.all) == 0
return
if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
# Scenario: load logitproc based on fully-qualified class name (FQCN)
llm_kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS:
# Scenario: load logitproc from provided class object
llm_kwargs["logits_processors"] = [DummyLogitsProcessor]
with pytest.raises(ValueError, match=config["error_message"]):
# Require that loading a model alongside the logitproc raises
# the appropriate exception.
LLM(**llm_kwargs)