Merge branch 'main' into mlm-full-lora-support

This commit is contained in:
B-201 2025-12-12 00:04:59 +08:00 committed by GitHub
commit e10321bf6a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
59 changed files with 2016 additions and 667 deletions

View File

@ -32,12 +32,11 @@ def benchmark_propose(args):
model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
seed=None,
seed=0,
trust_remote_code=False,
)
proposer = NgramProposer(

View File

@ -0,0 +1,150 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation
in MLA (Multi-head Latent Attention) prefill.
This validates that the optimization from commit 8d4142bd is beneficial across
various batch sizes, not just the originally tested batch size of 32768.
"""
import time
from collections.abc import Callable
import torch
# DeepSeek-V3 MLA dimensions
NUM_HEADS = 128
QK_NOPE_HEAD_DIM = 128
PE_DIM = 64
def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
"""Original torch.cat approach with expand."""
return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
"""Optimized direct copy approach (avoids expand + cat overhead)."""
k = torch.empty(
(*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
dtype=k_nope.dtype,
device=k_nope.device,
)
k[..., : k_nope.shape[-1]] = k_nope
k[..., k_nope.shape[-1] :] = k_pe
return k
def benchmark_method(
method: Callable,
k_nope: torch.Tensor,
k_pe: torch.Tensor,
num_warmup: int = 10,
num_iters: int = 100,
) -> float:
"""Benchmark a concatenation method and return mean latency in ms."""
# Warmup
for _ in range(num_warmup):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
# Benchmark
start = time.perf_counter()
for _ in range(num_iters):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / num_iters * 1000 # Convert to ms
@torch.inference_mode()
def run_benchmark(dtype: torch.dtype, dtype_name: str):
"""Run benchmark for a specific dtype."""
torch.set_default_device("cuda")
# Batch sizes to test (powers of 2 from 32 to 65536)
batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536]
print("=" * 80)
print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation")
print("=" * 80)
print(
f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], "
f"k_pe=[B, 1, {PE_DIM}]"
)
print(f"dtype: {dtype_name}")
print()
print(
f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | "
f"{'Speedup':>8} | {'Reduction':>10}"
)
print("-" * 70)
results = []
for batch_size in batch_sizes:
# Create input tensors (generate in float32 then convert for FP8 compatibility)
k_nope = torch.randn(
batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda"
).to(dtype)
k_pe = torch.randn(
batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda"
).to(dtype)
# Benchmark both methods
cat_time = benchmark_method(cat_method, k_nope, k_pe)
direct_time = benchmark_method(direct_copy_method, k_nope, k_pe)
speedup = cat_time / direct_time
reduction = (1 - direct_time / cat_time) * 100
results.append((batch_size, cat_time, direct_time, speedup, reduction))
print(
f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | "
f"{speedup:>7.2f}x | {reduction:>9.1f}%"
)
print("=" * 80)
# Summary statistics
speedups = [r[3] for r in results]
print("\nSpeedup summary:")
print(f" Min: {min(speedups):.2f}x")
print(f" Max: {max(speedups):.2f}x")
print(f" Mean: {sum(speedups) / len(speedups):.2f}x")
# Find crossover point
crossover_batch = None
for batch_size, _, _, speedup, _ in results:
if speedup >= 1.0:
crossover_batch = batch_size
break
print("\nConclusion:")
if crossover_batch:
print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}")
# Filter for large batches (>= 512 which is typical for prefill)
large_batch_speedups = [r[3] for r in results if r[0] >= 512]
if large_batch_speedups:
avg_large = sum(large_batch_speedups) / len(large_batch_speedups)
print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x")
print(" - MLA prefill typically uses large batches, so optimization is effective")
return results
@torch.inference_mode()
def main():
# Test bfloat16
print("\n")
run_benchmark(torch.bfloat16, "bfloat16")
# Test float8_e4m3fn
print("\n")
run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn")
if __name__ == "__main__":
main()

View File

@ -152,5 +152,5 @@ The interface for the model/module may change during vLLM's development. If you
## Deprecation announcement
!!! warning "Deprecations"
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It will be removed in v0.13.0 or v1.0.0.
- `_Backend` in `vllm.attention` is deprecated. It will be removed in v0.13.0 or v1.0.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
- `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
- `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.

View File

@ -26,3 +26,4 @@ The backends below live **outside** the main `vllm` repository and follow the
| Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
| Cambricon MLU | `vllm-mlu` | <https://github.com/Cambricon/vllm-mlu> |
| Baidu Kunlun XPU | N/A, install from source | <https://github.com/baidu/vLLM-Kunlun> |

View File

@ -29,8 +29,27 @@ uv pip install --pre vllm==<version>+cpu --extra-index-url https://wheels.vllm.a
The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
!!! note
Nightly wheels are currently unsupported for this architecture. (e.g. to bisect the behavior change, performance regression).
**Install the latest code**
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides working pre-built Arm CPU wheels for every commit since `v0.11.2` on <https://wheels.vllm.ai/nightly>. For native CPU wheels, this index should be used:
* `https://wheels.vllm.ai/nightly/cpu/vllm`
To install from nightly index, copy the link address of the `*.whl` under this index to run, for example:
```bash
uv pip install -U https://wheels.vllm.ai/c756fb678184b867ed94e5613a529198f1aee423/vllm-0.13.0rc2.dev11%2Bgc756fb678.cpu-cp38-abi3-manylinux_2_31_aarch64.whl # current nightly build (the filename will change!)
```
**Install specific revisions**
If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), specify the full commit hash in the index:
https://wheels.vllm.ai/${VLLM_COMMIT}/cpu/vllm .
Then, copy the link address of the `*.whl` under this index to run:
```bash
uv pip install -U <wheel-url>
```
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

View File

@ -316,10 +316,13 @@ We have split the `encode` task into two more specific token-wise tasks: `token_
### Remove softmax from PoolingParams
We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
We are going to remove `softmax` and `activation` from `PoolingParams` in v0.15. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function.
### as_reward_model
!!! warning
We are going to remove `--convert reward` in v0.15, use `--convert embed` instead.
Pooling models now default support all pooling, you can use it without any settings.
- Extracting hidden states prefers using `token_embed` task.

View File

@ -422,7 +422,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
parser.add_argument(

View File

@ -77,7 +77,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()

View File

@ -158,7 +158,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)

View File

@ -158,7 +158,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)

View File

@ -2031,7 +2031,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)

View File

@ -1382,7 +1382,7 @@ def run_generate(
model,
question: str,
image_urls: list[str],
seed: int | None,
seed: int,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
@ -1416,7 +1416,7 @@ def run_chat(
model: str,
question: str,
image_urls: list[str],
seed: int | None,
seed: int,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls)
@ -1494,7 +1494,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
parser.add_argument(

View File

@ -16,7 +16,7 @@ import requests
# - start vllm in serving mode with the below args
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --model-impl terratorch
# --task embed --trust-remote-code
# --trust-remote-code
# --skip-tokenizer-init --enforce-eager
# --io-processor-plugin terratorch_segmentation
# --enable-mm-embeds

View File

@ -305,7 +305,7 @@ def get_query(modality: QueryModality):
raise ValueError(msg)
def run_encode(model: str, modality: QueryModality, seed: int | None):
def run_encode(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
@ -335,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: int | None):
print("-" * 50)
def run_score(model: str, modality: QueryModality, seed: int | None):
def run_score(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
@ -390,7 +390,7 @@ def parse_args():
parser.add_argument(
"--seed",
type=int,
default=None,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import logging
from contextlib import nullcontext
from unittest.mock import patch
@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig
from vllm.config.compilation import CompilationMode, PassConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.logger import _print_warning_once
from vllm.platforms import current_platform
from vllm.utils.torch_utils import _is_torch_equal_or_newer
@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
pass_config={"enable_attn_fusion": True, "enable_noop": True},
pass_config={"fuse_attn_quant": True, "eliminate_noops": True},
custom_ops=["+quant_fp8"],
cudagraph_mode=CUDAGraphMode.PIECEWISE,
),
@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init(
vllm_config.compilation_config.max_cudagraph_capture_size
== expected_max_size
)
def test_pass_config_deprecation(caplog_vllm):
caplog_vllm.set_level(logging.WARNING)
# Clear cache to ensure warnings are re-issued
_print_warning_once.cache_clear()
# Test enable_fusion -> fuse_norm_quant, fuse_act_quant
caplog_vllm.clear()
config = PassConfig(enable_fusion=True)
assert "enable_fusion is deprecated" in caplog_vllm.text
assert config.fuse_norm_quant is True
assert config.fuse_act_quant is True
assert config.enable_fusion is True
# Test enable_attn_fusion -> fuse_attn_quant
caplog_vllm.clear()
config = PassConfig(enable_attn_fusion=True)
assert "enable_attn_fusion is deprecated" in caplog_vllm.text
assert config.fuse_attn_quant is True
assert config.enable_attn_fusion is True
# Test enable_noop -> eliminate_noops
caplog_vllm.clear()
config = PassConfig(enable_noop=True)
assert "enable_noop is deprecated" in caplog_vllm.text
assert config.eliminate_noops is True
assert config.enable_noop is True
# Test enable_sequence_parallelism -> enable_sp
caplog_vllm.clear()
config = PassConfig(enable_sequence_parallelism=True)
assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text
assert config.enable_sp is True
assert config.enable_sequence_parallelism is True
# Test enable_async_tp -> fuse_gemm_comms
caplog_vllm.clear()
config = PassConfig(enable_async_tp=True)
assert "enable_async_tp is deprecated" in caplog_vllm.text
assert config.fuse_gemm_comms is True
assert config.enable_async_tp is True
# Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
caplog_vllm.clear()
config = PassConfig(enable_fi_allreduce_fusion=True)
assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text
assert config.fuse_allreduce_rms is True
assert config.enable_fi_allreduce_fusion is True
# Test hash consistency
config_old = PassConfig(enable_fusion=True)
config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True)
assert config_old.compute_hash() == config_new.compute_hash()
config_old = PassConfig(enable_async_tp=True)
config_new = PassConfig(fuse_gemm_comms=True)
assert config_old.compute_hash() == config_new.compute_hash()

View File

@ -741,7 +741,7 @@ class VllmRunner:
tokenizer_name: str | None = None,
tokenizer_mode: str = "auto",
trust_remote_code: bool = True,
seed: int | None = 0,
seed: int = 0,
max_model_len: int | None = 1024,
dtype: str = "auto",
disable_log_stats: bool = True,

View File

@ -350,21 +350,35 @@ def test_human_readable_model_len():
assert args.max_model_len == 1_000_000
args = parser.parse_args(["--max-model-len", "10k"])
assert args.max_model_len == 10_000
args = parser.parse_args(["--max-model-len", "2g"])
assert args.max_model_len == 2_000_000_000
args = parser.parse_args(["--max-model-len", "2t"])
assert args.max_model_len == 2_000_000_000_000
# Capital
args = parser.parse_args(["--max-model-len", "3K"])
assert args.max_model_len == 1024 * 3
assert args.max_model_len == 2**10 * 3
args = parser.parse_args(["--max-model-len", "10M"])
assert args.max_model_len == 2**20 * 10
args = parser.parse_args(["--max-model-len", "4G"])
assert args.max_model_len == 2**30 * 4
args = parser.parse_args(["--max-model-len", "4T"])
assert args.max_model_len == 2**40 * 4
# Decimal values
args = parser.parse_args(["--max-model-len", "10.2k"])
assert args.max_model_len == 10200
# ..truncated to the nearest int
args = parser.parse_args(["--max-model-len", "10.212345k"])
args = parser.parse_args(["--max-model-len", "10.2123451234567k"])
assert args.max_model_len == 10212
args = parser.parse_args(["--max-model-len", "10.2123451234567m"])
assert args.max_model_len == 10212345
args = parser.parse_args(["--max-model-len", "10.2123451234567g"])
assert args.max_model_len == 10212345123
args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
assert args.max_model_len == 10212345123456
# Invalid (do not allow decimals with binary multipliers)
for invalid in ["1a", "pwd", "10.24", "1.23M"]:
for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
with pytest.raises(ArgumentError):
args = parser.parse_args(["--max-model-len", invalid])
parser.parse_args(["--max-model-len", invalid])

View File

@ -70,12 +70,12 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
f"{torch.cuda.device_count()}"
)
# `cuda_graph_sizes=[16]` to reduce load time.
# `cudagraph_capture_sizes=[16]` to reduce load time.
with vllm_runner(
model_case.model_id,
tensor_parallel_size=model_case.tp,
load_format="dummy",
cuda_graph_sizes=[16],
cudagraph_capture_sizes=[16],
) as llm:
# Disabled as check_model is broken: https://github.com/vllm-project/vllm/pull/18465#issuecomment-3329880562
# def check_model(model):

View File

@ -17,7 +17,6 @@ def test_idefics_multimodal(
with vllm_runner(
model_name="HuggingFaceM4/Idefics3-8B-Llama3",
runner="pooling",
task="classify",
convert="classify",
load_format="dummy",
max_model_len=512,
@ -86,7 +85,6 @@ def test_gemma_multimodal(
with vllm_runner(
model_name="google/gemma-3-4b-it",
runner="pooling",
task="classify",
convert="classify",
load_format="auto",
hf_overrides=update_config,

View File

@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
"""
Regression test for handling videos with broken frames.
This test uses a pre-corrupted video file (assets/corrupted.mp4) that
contains broken/unreadable frames to verify the video loader handles
contains broken frames to verify the video loader handles
them gracefully without crashing and returns accurate metadata.
"""
with monkeypatch.context() as m:
@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
f"Expected fewer than {metadata['total_num_frames']} frames, "
f"but loaded {frames.shape[0]} frames"
)
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
"""Test loader that returns FAKE_OUTPUT_1 to verify backend selection."""
@classmethod
def load_bytes(
cls, data: bytes, num_frames: int = -1, **kwargs
) -> tuple[npt.NDArray, dict]:
return FAKE_OUTPUT_1, {"video_backend": "test_video_backend_override_1"}
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
"""Test loader that returns FAKE_OUTPUT_2 to verify backend selection."""
@classmethod
def load_bytes(
cls, data: bytes, num_frames: int = -1, **kwargs
) -> tuple[npt.NDArray, dict]:
return FAKE_OUTPUT_2, {"video_backend": "test_video_backend_override_2"}
def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
"""
Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND
environment variable.
This allows users to dynamically select a different video backend
via --media-io-kwargs without changing the global env var, which is
useful when plugins set a default backend but a specific request
needs a different one.
"""
with monkeypatch.context() as m:
# Set the env var to one backend
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1")
imageio = ImageMediaIO()
# Without video_backend kwarg, should use env var backend
videoio_default = VideoMediaIO(imageio, num_frames=10)
frames_default, metadata_default = videoio_default.load_bytes(b"test")
np.testing.assert_array_equal(frames_default, FAKE_OUTPUT_1)
assert metadata_default["video_backend"] == "test_video_backend_override_1"
# With video_backend kwarg, should override env var
videoio_override = VideoMediaIO(
imageio, num_frames=10, video_backend="test_video_backend_override_2"
)
frames_override, metadata_override = videoio_override.load_bytes(b"test")
np.testing.assert_array_equal(frames_override, FAKE_OUTPUT_2)
assert metadata_override["video_backend"] == "test_video_backend_override_2"
def test_video_media_io_backend_kwarg_not_passed_to_loader(
monkeypatch: pytest.MonkeyPatch,
):
"""
Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed
through to the underlying video loader's load_bytes method.
This ensures the kwarg is properly popped from kwargs before forwarding.
"""
@VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
class RejectVideoBackendKwargLoader(VideoLoader):
"""Test loader that fails if video_backend is passed through."""
@classmethod
def load_bytes(
cls, data: bytes, num_frames: int = -1, **kwargs
) -> tuple[npt.NDArray, dict]:
# This should never receive video_backend in kwargs
if "video_backend" in kwargs:
raise AssertionError(
"video_backend should be consumed by VideoMediaIO, "
"not passed to loader"
)
return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs.keys())}
with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_reject_video_backend_kwarg")
imageio = ImageMediaIO()
# Even when video_backend is provided, it should NOT be passed to loader
videoio = VideoMediaIO(
imageio,
num_frames=10,
video_backend="test_reject_video_backend_kwarg",
other_kwarg="should_pass_through",
)
# This should NOT raise AssertionError
frames, metadata = videoio.load_bytes(b"test")
np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)
# Verify other kwargs are still passed through
assert "other_kwarg" in metadata["received_kwargs"]
def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
"""
Test that when video_backend kwarg is None or not provided,
VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var.
"""
with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_2")
imageio = ImageMediaIO()
# Explicit None should fall back to env var
videoio_none = VideoMediaIO(imageio, num_frames=10, video_backend=None)
frames_none, metadata_none = videoio_none.load_bytes(b"test")
np.testing.assert_array_equal(frames_none, FAKE_OUTPUT_2)
assert metadata_none["video_backend"] == "test_video_backend_override_2"
# Not providing video_backend should also fall back to env var
videoio_missing = VideoMediaIO(imageio, num_frames=10)
frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
assert metadata_missing["video_backend"] == "test_video_backend_override_2"

View File

@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
task = "wikitext"
rtol = 0.1
# Smaller cuda_graph_sizes to speed up the test.
# Smaller cudagraph_capture_sizes to speed up the test.
results = lm_eval.simple_evaluate(
model="vllm",
model_args=config.get_model_args(
tp_size=tp_size, kwargs={"cuda_graph_sizes": [16]}
tp_size=tp_size, kwargs={"cudagraph_capture_sizes": [16]}
),
tasks=task,
batch_size=64,

View File

@ -0,0 +1,195 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "minimax_m2_append_think"
end_token = "</think>"
# MiniMax M2 model path
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
# =============================================================================
# MiniMaxM2AppendThinkReasoningParser behavior:
# - Prepends <think> to the beginning of the output
# - Does NOT separate reasoning and content
# - Returns everything as content (with <think> prepended)
# - reasoning is always None
#
# This parser is used when you want to keep the raw output with <think> added
# =============================================================================
# Case: simple output with end token
SIMPLE_OUTPUT = {
"output": "This is reasoning</think>This is response",
"reasoning": None,
"content": "<think>This is reasoning</think>This is response",
"is_reasoning_end": True,
}
# Case: output without end token (reasoning in progress)
NO_END_TOKEN = {
"output": "This is reasoning in progress",
"reasoning": None,
"content": "<think>This is reasoning in progress",
"is_reasoning_end": False,
}
# Case: only end token
ONLY_END_TOKEN = {
"output": "</think>This is response",
"reasoning": None,
"content": "<think></think>This is response",
"is_reasoning_end": True,
}
# Case: multiple lines
MULTIPLE_LINES = {
"output": "Line 1\nLine 2</think>Response 1\nResponse 2",
"reasoning": None,
"content": "<think>Line 1\nLine 2</think>Response 1\nResponse 2",
"is_reasoning_end": True,
}
# Case: empty output (non-streaming prepends <think>)
EMPTY = {
"output": "",
"reasoning": None,
"content": "<think>",
"is_reasoning_end": False,
}
# Case: empty output streaming (no tokens = no output)
EMPTY_STREAMING = {
"output": "",
"reasoning": None,
"content": None,
"is_reasoning_end": False,
}
# Case: special characters
SPECIAL_CHARS = {
"output": "Let me think... 1+1=2</think>Yes!",
"reasoning": None,
"content": "<think>Let me think... 1+1=2</think>Yes!",
"is_reasoning_end": True,
}
# Case: code in output
CODE_OUTPUT = {
"output": "```python\nprint('hi')\n```</think>Here's the code.",
"reasoning": None,
"content": "<think>```python\nprint('hi')\n```</think>Here's the code.",
"is_reasoning_end": True,
}
TEST_CASES = [
pytest.param(
False,
SIMPLE_OUTPUT,
id="simple_output",
),
pytest.param(
True,
SIMPLE_OUTPUT,
id="simple_output_streaming",
),
pytest.param(
False,
NO_END_TOKEN,
id="no_end_token",
),
pytest.param(
True,
NO_END_TOKEN,
id="no_end_token_streaming",
),
pytest.param(
False,
ONLY_END_TOKEN,
id="only_end_token",
),
pytest.param(
True,
ONLY_END_TOKEN,
id="only_end_token_streaming",
),
pytest.param(
False,
MULTIPLE_LINES,
id="multiple_lines",
),
pytest.param(
True,
MULTIPLE_LINES,
id="multiple_lines_streaming",
),
pytest.param(
False,
EMPTY,
id="empty",
),
pytest.param(
True,
EMPTY_STREAMING,
id="empty_streaming",
),
pytest.param(
False,
SPECIAL_CHARS,
id="special_chars",
),
pytest.param(
True,
SPECIAL_CHARS,
id="special_chars_streaming",
),
pytest.param(
False,
CODE_OUTPUT,
id="code_output",
),
pytest.param(
True,
CODE_OUTPUT,
id="code_output_streaming",
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
streaming: bool,
param_dict: dict,
minimax_m2_tokenizer,
):
output = minimax_m2_tokenizer.tokenize(param_dict["output"])
# decode everything to tokens
output_tokens: list[str] = [
minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
]
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
minimax_m2_tokenizer
)
reasoning, content = run_reasoning_extraction(
parser, output_tokens, streaming=streaming
)
assert reasoning == param_dict["reasoning"]
assert content == param_dict["content"]
# Test is_reasoning_end
output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
is_reasoning_end = parser.is_reasoning_end(output_ids)
assert is_reasoning_end == param_dict["is_reasoning_end"]

View File

@ -0,0 +1,230 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from transformers import AutoTokenizer
from tests.reasoning.utils import run_reasoning_extraction
from vllm.reasoning import ReasoningParser, ReasoningParserManager
parser_name = "minimax_m2"
end_token = "</think>"
# MiniMax M2 model path
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
# =============================================================================
# MiniMax M2 specific behavior:
# - Model does NOT generate <think> start token
# - Model only generates </think> end token
# - All content before </think> is reasoning
# - All content after </think> is the actual response (content)
# =============================================================================
# Case: reasoning + end token + content (typical case)
SIMPLE_REASONING = {
"output": "This is a reasoning section</think>This is the rest",
"reasoning": "This is a reasoning section",
"content": "This is the rest",
"is_reasoning_end": True,
}
# Case: reasoning + end token only (no content after)
COMPLETE_REASONING = {
"output": "This is a reasoning section</think>",
"reasoning": "This is a reasoning section",
"content": None,
"is_reasoning_end": True,
}
# Case: no end token yet (streaming in progress, all is reasoning)
NO_END_TOKEN = {
"output": "This is reasoning in progress",
"reasoning": "This is reasoning in progress",
"content": None,
"is_reasoning_end": False,
}
# Case: multiple lines of reasoning
MULTIPLE_LINES = {
"output": "First line\nSecond line</think>Response first line\nResponse second",
"reasoning": "First line\nSecond line",
"content": "Response first line\nResponse second",
"is_reasoning_end": True,
}
# Case: only end token (empty reasoning, immediate response)
SHORTEST_REASONING_NO_STREAMING = {
"output": "</think>This is the response",
"reasoning": "",
"content": "This is the response",
"is_reasoning_end": True,
}
# Case: only end token streaming (reasoning is None because it's just the token)
SHORTEST_REASONING_STREAMING = {
"output": "</think>This is the response",
"reasoning": None,
"content": "This is the response",
"is_reasoning_end": True,
}
# Case: empty output
EMPTY = {
"output": "",
"reasoning": "",
"content": None,
"is_reasoning_end": False,
}
# Case: empty streaming
EMPTY_STREAMING = {
"output": "",
"reasoning": None,
"content": None,
"is_reasoning_end": False,
}
# Case: long reasoning with special characters
SPECIAL_CHARS = {
"output": "Let me think... 1+1=2, right?</think>Yes, 1+1=2.",
"reasoning": "Let me think... 1+1=2, right?",
"content": "Yes, 1+1=2.",
"is_reasoning_end": True,
}
# Case: reasoning with code blocks
CODE_IN_REASONING = {
"output": "```python\nprint('hello')\n```</think>Here is the code.",
"reasoning": "```python\nprint('hello')\n```",
"content": "Here is the code.",
"is_reasoning_end": True,
}
TEST_CASES = [
# Core cases: no start token (MiniMax M2 actual behavior)
pytest.param(
False,
SIMPLE_REASONING,
id="simple_reasoning",
),
pytest.param(
True,
SIMPLE_REASONING,
id="simple_reasoning_streaming",
),
pytest.param(
False,
COMPLETE_REASONING,
id="complete_reasoning",
),
pytest.param(
True,
COMPLETE_REASONING,
id="complete_reasoning_streaming",
),
pytest.param(
False,
NO_END_TOKEN,
id="no_end_token",
),
pytest.param(
True,
NO_END_TOKEN,
id="no_end_token_streaming",
),
pytest.param(
False,
MULTIPLE_LINES,
id="multiple_lines",
),
pytest.param(
True,
MULTIPLE_LINES,
id="multiple_lines_streaming",
),
pytest.param(
False,
SHORTEST_REASONING_NO_STREAMING,
id="shortest_reasoning",
),
pytest.param(
True,
SHORTEST_REASONING_STREAMING,
id="shortest_reasoning_streaming",
),
pytest.param(
False,
EMPTY,
id="empty",
),
pytest.param(
True,
EMPTY_STREAMING,
id="empty_streaming",
),
pytest.param(
False,
SPECIAL_CHARS,
id="special_chars",
),
pytest.param(
True,
SPECIAL_CHARS,
id="special_chars_streaming",
),
pytest.param(
False,
CODE_IN_REASONING,
id="code_in_reasoning",
),
pytest.param(
True,
CODE_IN_REASONING,
id="code_in_reasoning_streaming",
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
streaming: bool,
param_dict: dict,
minimax_m2_tokenizer,
):
output = minimax_m2_tokenizer.tokenize(param_dict["output"])
# decode everything to tokens
output_tokens: list[str] = [
minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
]
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)(
minimax_m2_tokenizer
)
reasoning, content = run_reasoning_extraction(
parser, output_tokens, streaming=streaming
)
assert reasoning == param_dict["reasoning"]
assert content == param_dict["content"]
# Test is_reasoning_end
output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
is_reasoning_end = parser.is_reasoning_end(output_ids)
assert is_reasoning_end == param_dict["is_reasoning_end"]
# Test extract_content
if param_dict["content"] is not None:
content = parser.extract_content_ids(output_ids)
assert content == minimax_m2_tokenizer.convert_tokens_to_ids(
minimax_m2_tokenizer.tokenize(param_dict["content"])
)
else:
content = parser.extract_content_ids(output)
assert content == []

View File

@ -89,64 +89,6 @@ def test_update_config():
new_config3 = update_config(config3, {"a": "new_value"})
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
[
("distilbert/distilgpt2", "generate", "none", "generate"),
("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"),
("openai/whisper-small", "generate", "none", "transcription"),
],
)
def test_auto_task(
model_id, expected_runner_type, expected_convert_type, expected_task
):
config = ModelConfig(model_id, task="auto")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
[
("distilbert/distilgpt2", "pooling", "embed", "embed"),
("intfloat/multilingual-e5-small", "pooling", "embed", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"),
("openai/whisper-small", "pooling", "embed", "embed"),
],
)
def test_score_task(
model_id, expected_runner_type, expected_convert_type, expected_task
):
config = ModelConfig(model_id, task="score")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type", "expected_task"),
[
("openai/whisper-small", "generate", "none", "transcription"),
],
)
def test_transcription_task(
model_id, expected_runner_type, expected_convert_type, expected_task
):
config = ModelConfig(model_id, task="transcription")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type"),
[
@ -1085,7 +1027,7 @@ def test_vllm_config_explicit_overrides():
)
# Override one field but not others
pass_config = PassConfig(enable_noop=False)
pass_config = PassConfig(eliminate_noops=False)
compilation_config = CompilationConfig(pass_config=pass_config)
config = VllmConfig(
model_config=regular_model,

View File

@ -119,7 +119,7 @@ class RemoteOpenAIServer:
vllm_serve_args: list[str],
*,
env_dict: dict[str, str] | None = None,
seed: int | None = 0,
seed: int = 0,
auto_port: bool = True,
max_wait_seconds: float | None = None,
override_hf_configs: dict[str, Any] | None = None,
@ -283,7 +283,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
child_process_fxn: Callable[[dict[str, str] | None, str, list[str]], None],
*,
env_dict: dict[str, str] | None = None,
seed: int | None = 0,
seed: int = 0,
auto_port: bool = True,
max_wait_seconds: float | None = None,
) -> None:

View File

@ -152,8 +152,8 @@ def run_tests(
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
else:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
# lock matmul precision to full FP32
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
# lock matmul precision to full FP32 (IEEE)
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
# m.setenv("VLLM_BATCH_INVARIANT", "1")
outputs: list[tuple[str, list, list]] = []
for n, (

View File

@ -0,0 +1,756 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock
import pytest
from vllm.distributed.kv_events import BlockStored
from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector import (
LMCacheConnectorV1,
LMCacheKVEvents,
)
from vllm.v1.outputs import KVConnectorOutput
@pytest.fixture
def mock_lmcache_engine_event():
"""Create a mock event object that mimics what the lmcache engine returns."""
class MockEvent:
def __init__(
self,
block_hashes,
parent_block_hash,
token_ids,
lora_id,
block_size,
medium,
):
self.block_hashes = block_hashes
self.parent_block_hash = parent_block_hash
self.token_ids = token_ids
self.lora_id = lora_id
self.block_size = block_size
self.medium = medium
return MockEvent(
block_hashes=["hash1", "hash2"],
parent_block_hash="parent_hash",
token_ids=[1, 2, 3, 4],
lora_id=None,
block_size=16,
medium="GPU",
)
@pytest.fixture
def mock_connector():
"""Create a mock LMCacheConnectorV1 instance with mocked dependencies."""
connector = MagicMock(spec=LMCacheConnectorV1)
connector._kv_cache_events = None
connector._lmcache_engine = MagicMock()
# Make the methods use the real implementation
connector.get_kv_connector_kv_cache_events = (
LMCacheConnectorV1.get_kv_connector_kv_cache_events.__get__(
connector, LMCacheConnectorV1
)
)
connector.update_connector_output = (
LMCacheConnectorV1.update_connector_output.__get__(
connector, LMCacheConnectorV1
)
)
connector.take_events = LMCacheConnectorV1.take_events.__get__(
connector, LMCacheConnectorV1
)
return connector
class TestGetKVConnectorKVCacheEvents:
"""Test get_kv_connector_kv_cache_events method."""
def test_returns_none_when_no_events(self, mock_connector):
"""Test that None is returned when lmcache engine has no events."""
mock_connector._lmcache_engine.get_kv_events.return_value = None
result = mock_connector.get_kv_connector_kv_cache_events()
assert result is None
mock_connector._lmcache_engine.get_kv_events.assert_called_once()
def test_returns_none_when_empty_list(self, mock_connector):
"""Test that None is returned when lmcache engine returns empty list."""
mock_connector._lmcache_engine.get_kv_events.return_value = []
result = mock_connector.get_kv_connector_kv_cache_events()
assert result is None
def test_converts_single_event(self, mock_connector, mock_lmcache_engine_event):
"""Test conversion of a single event from lmcache engine format."""
mock_connector._lmcache_engine.get_kv_events.return_value = [
mock_lmcache_engine_event
]
result = mock_connector.get_kv_connector_kv_cache_events()
assert result is not None
assert isinstance(result, LMCacheKVEvents)
assert result.get_number_of_workers() == 1
events = result.get_all_events()
assert len(events) == 1
assert isinstance(events[0], BlockStored)
assert events[0].block_hashes == ["hash1", "hash2"]
assert events[0].parent_block_hash == "parent_hash"
assert events[0].token_ids == [1, 2, 3, 4]
assert events[0].lora_id is None
assert events[0].block_size == 16
assert events[0].medium == "GPU"
def test_converts_multiple_events(self, mock_connector):
"""Test conversion of multiple events from lmcache engine format."""
class MockEvent:
def __init__(self, i):
self.block_hashes = [f"hash{i}"]
self.parent_block_hash = f"parent{i}"
self.token_ids = [i]
self.lora_id = None
self.block_size = 16
self.medium = "GPU"
events = [MockEvent(i) for i in range(5)]
mock_connector._lmcache_engine.get_kv_events.return_value = events
result = mock_connector.get_kv_connector_kv_cache_events()
assert result is not None
assert isinstance(result, LMCacheKVEvents)
converted_events = result.get_all_events()
assert len(converted_events) == 5
for i, event in enumerate(converted_events):
assert isinstance(event, BlockStored)
assert event.block_hashes == [f"hash{i}"]
assert event.parent_block_hash == f"parent{i}"
assert event.token_ids == [i]
def test_preserves_event_attributes(self, mock_connector):
"""Test that all event attributes are correctly preserved."""
class MockEventWithLora:
def __init__(self):
self.block_hashes = ["hash_a", "hash_b", "hash_c"]
self.parent_block_hash = "parent_xyz"
self.token_ids = [100, 200, 300]
self.lora_id = 42
self.block_size = 32
self.medium = "DISK"
mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEventWithLora()
]
result = mock_connector.get_kv_connector_kv_cache_events()
events = result.get_all_events()
event = events[0]
assert event.block_hashes == ["hash_a", "hash_b", "hash_c"]
assert event.parent_block_hash == "parent_xyz"
assert event.token_ids == [100, 200, 300]
assert event.lora_id == 42
assert event.block_size == 32
assert event.medium == "DISK"
def test_handles_none_parent_block_hash(self, mock_connector):
"""Test handling of events with None parent_block_hash."""
class MockEventNoParent:
def __init__(self):
self.block_hashes = ["hash1"]
self.parent_block_hash = None
self.token_ids = [1, 2]
self.lora_id = None
self.block_size = 16
self.medium = "GPU"
mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEventNoParent()
]
result = mock_connector.get_kv_connector_kv_cache_events()
events = result.get_all_events()
assert events[0].parent_block_hash is None
class TestUpdateConnectorOutput:
"""Test update_connector_output method."""
def test_does_nothing_when_kv_cache_events_is_none(self, mock_connector):
"""Test that method returns early when kv_cache_events is None."""
connector_output = KVConnectorOutput(kv_cache_events=None)
mock_connector.update_connector_output(connector_output)
assert mock_connector._kv_cache_events is None
def test_does_nothing_when_kv_cache_events_is_not_lmcache_kv_events(
self, mock_connector
):
"""Test that method returns early when kv_cache_events is not
LMCacheKVEvents."""
# Create a mock object that is not LMCacheKVEvents
fake_events = MagicMock()
connector_output = KVConnectorOutput(kv_cache_events=fake_events)
mock_connector.update_connector_output(connector_output)
assert mock_connector._kv_cache_events is None
def test_sets_kv_cache_events_when_none(self, mock_connector):
"""Test that _kv_cache_events is set when it was None."""
kv_events = LMCacheKVEvents(num_workers=1)
event = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1, 2],
block_size=16,
lora_id=None,
medium="GPU",
)
kv_events.add_events([event])
connector_output = KVConnectorOutput(kv_cache_events=kv_events)
mock_connector.update_connector_output(connector_output)
assert mock_connector._kv_cache_events is kv_events
def test_adds_events_when_kv_cache_events_already_exists(self, mock_connector):
"""Test that events are added when _kv_cache_events already exists."""
# Set up existing events
existing_events = LMCacheKVEvents(num_workers=2)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
existing_events.add_events([event1])
existing_events.add_events([event1]) # Simulate 2 workers reporting
mock_connector._kv_cache_events = existing_events
# Create new events to add
new_events = LMCacheKVEvents(num_workers=1)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
new_events.add_events([event2])
connector_output = KVConnectorOutput(kv_cache_events=new_events)
mock_connector.update_connector_output(connector_output)
# Check that events were added
all_events = mock_connector._kv_cache_events.get_all_events()
assert len(all_events) == 3 # 2 from existing + 1 from new
assert event1 in all_events
assert event2 in all_events
def test_increments_workers_when_kv_cache_events_already_exists(
self, mock_connector
):
"""Test that worker count is incremented correctly."""
# Set up existing events with 2 workers
existing_events = LMCacheKVEvents(num_workers=2)
mock_connector._kv_cache_events = existing_events
# Create new events from 3 workers
new_events = LMCacheKVEvents(num_workers=3)
event = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
new_events.add_events([event])
connector_output = KVConnectorOutput(kv_cache_events=new_events)
mock_connector.update_connector_output(connector_output)
# Worker count should be 2 + 3 = 5
assert mock_connector._kv_cache_events.get_number_of_workers() == 5
def test_multiple_updates(self, mock_connector):
"""Test multiple consecutive updates."""
# First update
events1 = LMCacheKVEvents(num_workers=1)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
events1.add_events([event1])
output1 = KVConnectorOutput(kv_cache_events=events1)
mock_connector.update_connector_output(output1)
# Second update
events2 = LMCacheKVEvents(num_workers=2)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
events2.add_events([event2])
output2 = KVConnectorOutput(kv_cache_events=events2)
mock_connector.update_connector_output(output2)
# Third update
events3 = LMCacheKVEvents(num_workers=1)
event3 = BlockStored(
block_hashes=["hash3"],
parent_block_hash=None,
token_ids=[3],
block_size=16,
lora_id=None,
medium="GPU",
)
events3.add_events([event3])
output3 = KVConnectorOutput(kv_cache_events=events3)
mock_connector.update_connector_output(output3)
# Check final state
all_events = mock_connector._kv_cache_events.get_all_events()
assert len(all_events) == 3
assert mock_connector._kv_cache_events.get_number_of_workers() == 4 # 1+2+1
def test_updates_with_empty_events(self, mock_connector):
"""Test updating with empty event lists."""
# First update with actual events
events1 = LMCacheKVEvents(num_workers=1)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
events1.add_events([event1])
output1 = KVConnectorOutput(kv_cache_events=events1)
mock_connector.update_connector_output(output1)
# Second update with empty events
events2 = LMCacheKVEvents(num_workers=2)
# No events added
output2 = KVConnectorOutput(kv_cache_events=events2)
mock_connector.update_connector_output(output2)
# Should still have the original event
all_events = mock_connector._kv_cache_events.get_all_events()
assert len(all_events) == 1
assert mock_connector._kv_cache_events.get_number_of_workers() == 3
class TestTakeEvents:
"""Test take_events method."""
def test_yields_nothing_when_kv_cache_events_is_none(self, mock_connector):
"""Test that nothing is yielded when _kv_cache_events is None."""
mock_connector._kv_cache_events = None
events = list(mock_connector.take_events())
assert events == []
def test_yields_events_and_clears(self, mock_connector):
"""Test that events are yielded and then cleared."""
# Set up events
kv_events = LMCacheKVEvents(num_workers=1)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
kv_events.add_events([event1, event2])
mock_connector._kv_cache_events = kv_events
# Take events
events = list(mock_connector.take_events())
# Check that events were yielded
assert len(events) == 2
assert event1 in events
assert event2 in events
# Check that _kv_cache_events was cleared
assert mock_connector._kv_cache_events is None
def test_aggregates_before_yielding(self, mock_connector):
"""Test that events are aggregated before yielding."""
# Set up events from multiple workers
kv_events = LMCacheKVEvents(num_workers=3)
common_event = BlockStored(
block_hashes=["hash_common"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
uncommon_event = BlockStored(
block_hashes=["hash_uncommon"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
# All 3 workers report common_event
kv_events.add_events([common_event])
kv_events.add_events([common_event])
kv_events.add_events([common_event])
# Only 1 worker reports uncommon_event
kv_events.add_events([uncommon_event])
mock_connector._kv_cache_events = kv_events
# Take events
events = list(mock_connector.take_events())
# Only the common event should be yielded
assert len(events) == 1
assert events[0] == common_event
def test_multiple_take_events_calls(self, mock_connector):
"""Test calling take_events multiple times."""
# First call with events
kv_events1 = LMCacheKVEvents(num_workers=1)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
kv_events1.add_events([event1])
mock_connector._kv_cache_events = kv_events1
events1 = list(mock_connector.take_events())
assert len(events1) == 1
assert events1[0] == event1
assert mock_connector._kv_cache_events is None
# Second call with no events
events2 = list(mock_connector.take_events())
assert events2 == []
# Third call after adding new events
kv_events2 = LMCacheKVEvents(num_workers=1)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
kv_events2.add_events([event2])
mock_connector._kv_cache_events = kv_events2
events3 = list(mock_connector.take_events())
assert len(events3) == 1
assert events3[0] == event2
def test_yields_empty_after_aggregation_removes_all(self, mock_connector):
"""Test that nothing is yielded if aggregation removes all events."""
# Set up events from 2 workers with no common events
kv_events = LMCacheKVEvents(num_workers=2)
event1 = BlockStored(
block_hashes=["hash1"],
parent_block_hash=None,
token_ids=[1],
block_size=16,
lora_id=None,
medium="GPU",
)
event2 = BlockStored(
block_hashes=["hash2"],
parent_block_hash=None,
token_ids=[2],
block_size=16,
lora_id=None,
medium="GPU",
)
# Worker 1 reports event1
kv_events.add_events([event1])
# Worker 2 reports event2
kv_events.add_events([event2])
mock_connector._kv_cache_events = kv_events
# Take events
events = list(mock_connector.take_events())
# No common events, so nothing should be yielded
assert events == []
assert mock_connector._kv_cache_events is None
class TestIntegrationScenarios:
"""Test integration scenarios."""
def test_full_workflow(self, mock_connector, mock_lmcache_engine_event):
"""Test a complete workflow from getting events to taking them."""
# Step 1: Get events from lmcache engine
mock_connector._lmcache_engine.get_kv_events.return_value = [
mock_lmcache_engine_event
]
kv_events = mock_connector.get_kv_connector_kv_cache_events()
assert kv_events is not None
assert len(kv_events.get_all_events()) == 1
# Step 2: Update connector output (simulate receiving from worker)
output1 = KVConnectorOutput(kv_cache_events=kv_events)
mock_connector.update_connector_output(output1)
assert mock_connector._kv_cache_events is not None
# Step 3: Take events
taken_events = list(mock_connector.take_events())
assert len(taken_events) == 1
assert mock_connector._kv_cache_events is None
def test_multiple_workers_workflow(self, mock_connector):
"""Test workflow with multiple workers."""
class MockEvent:
def __init__(self, hash_val):
self.block_hashes = [hash_val]
self.parent_block_hash = None
self.token_ids = [1]
self.lora_id = None
self.block_size = 16
self.medium = "GPU"
# Worker 1
mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEvent("hash_common"),
MockEvent("hash_worker1"),
]
kv_events1 = mock_connector.get_kv_connector_kv_cache_events()
output1 = KVConnectorOutput(kv_cache_events=kv_events1)
mock_connector.update_connector_output(output1)
# Worker 2
mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEvent("hash_common"),
MockEvent("hash_worker2"),
]
kv_events2 = mock_connector.get_kv_connector_kv_cache_events()
output2 = KVConnectorOutput(kv_cache_events=kv_events2)
mock_connector.update_connector_output(output2)
# Take events (should only get common events)
taken_events = list(mock_connector.take_events())
# With aggregation, only events reported by both workers should be present
# In this case, hash_common was reported by both
event_hashes = [e.block_hashes[0] for e in taken_events]
assert "hash_common" in event_hashes
def test_empty_workflow(self, mock_connector):
"""Test workflow when there are no events at any stage."""
# Get events returns None
mock_connector._lmcache_engine.get_kv_events.return_value = None
kv_events = mock_connector.get_kv_connector_kv_cache_events()
assert kv_events is None
# Update with None
output = KVConnectorOutput(kv_cache_events=None)
mock_connector.update_connector_output(output)
# Take events
taken_events = list(mock_connector.take_events())
assert taken_events == []
assert mock_connector._kv_cache_events is None
def test_repeated_cycles(self, mock_connector):
"""Test multiple cycles of the complete workflow."""
class MockEvent:
def __init__(self, cycle_num):
self.block_hashes = [f"hash_cycle_{cycle_num}"]
self.parent_block_hash = None
self.token_ids = [cycle_num]
self.lora_id = None
self.block_size = 16
self.medium = "GPU"
for cycle in range(3):
# Get events
mock_connector._lmcache_engine.get_kv_events.return_value = [
MockEvent(cycle)
]
kv_events = mock_connector.get_kv_connector_kv_cache_events()
# Update
output = KVConnectorOutput(kv_cache_events=kv_events)
mock_connector.update_connector_output(output)
# Take
taken_events = list(mock_connector.take_events())
# Verify
assert len(taken_events) == 1
assert taken_events[0].block_hashes[0] == f"hash_cycle_{cycle}"
assert mock_connector._kv_cache_events is None
def test_lmcache_kv_events_aggregation(self):
"""
Test LMCacheKVEvents aggregation across TP ranks using
KVOutputAggregator (used by MultiprocExecutor).
"""
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.v1.outputs import ModelRunnerOutput
# Create KVOutputAggregator for 3 workers (simulating TP=3)
aggregator = KVOutputAggregator(expected_finished_count=3)
# Define common and unique events
common_event = BlockStored(
block_hashes=["hash_common"],
parent_block_hash="parent_common",
token_ids=[1, 2, 3],
block_size=16,
lora_id=None,
medium="GPU",
)
worker1_unique_event = BlockStored(
block_hashes=["hash_worker1"],
parent_block_hash="parent_w1",
token_ids=[4, 5],
block_size=16,
lora_id=None,
medium="GPU",
)
worker2_unique_event = BlockStored(
block_hashes=["hash_worker2"],
parent_block_hash="parent_w2",
token_ids=[6, 7],
block_size=16,
lora_id=None,
medium="GPU",
)
worker3_unique_event = BlockStored(
block_hashes=["hash_worker3"],
parent_block_hash="parent_w3",
token_ids=[8, 9],
block_size=16,
lora_id=None,
medium="GPU",
)
# Create events for each worker
# Worker 0: reports common event and its unique event
worker0_events = LMCacheKVEvents(num_workers=1)
worker0_events.add_events([common_event, worker1_unique_event])
# Worker 1: reports common event and its unique event
worker1_events = LMCacheKVEvents(num_workers=1)
worker1_events.add_events([common_event, worker2_unique_event])
# Worker 2: reports common event and its unique event
worker2_events = LMCacheKVEvents(num_workers=1)
worker2_events.add_events([common_event, worker3_unique_event])
# Create ModelRunnerOutput instances for each worker
worker_outputs = []
for i, worker_events in enumerate(
[worker0_events, worker1_events, worker2_events]
):
output = ModelRunnerOutput(
req_ids=[f"req_{i}"],
req_id_to_index={f"req_{i}": 0},
sampled_token_ids=[[123]], # dummy token
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[None],
kv_connector_output=KVConnectorOutput(
finished_sending=set([f"req_{i}_send"])
if i < 2
else None, # Workers 0,1 finished sending
finished_recving=set([f"req_{i}_recv"])
if i > 0
else None, # Workers 1,2 finished receiving
kv_cache_events=worker_events,
),
)
worker_outputs.append(output)
# Use the real aggregation mechanism (like MultiprocExecutor.execute_model)
aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0)
kv_cache_events = aggregated_output.kv_connector_output.kv_cache_events
assert isinstance(kv_cache_events, LMCacheKVEvents)
# After aggregation, events should be combined from all workers
# The aggregator doesn't automatically aggregate events, so we need to call
# aggregate() to get only common events
kv_cache_events.aggregate()
aggregated_events = kv_cache_events.get_all_events()
# Only the common event should remain after aggregation
# because it's the only event reported by all 3 workers
assert len(aggregated_events) == 1
assert aggregated_events[0] == common_event
# Verify the common event properties
assert aggregated_events[0].block_hashes == ["hash_common"]
assert aggregated_events[0].parent_block_hash == "parent_common"
assert aggregated_events[0].token_ids == [1, 2, 3]

View File

@ -294,6 +294,12 @@ class AttentionImpl(ABC, Generic[T]):
# Some features like decode context parallelism require the softmax lse.
can_return_lse_for_decode: bool = False
# Whether the attention impl supports Prefill Context Parallelism.
supports_pcp: bool = False
# Whether the attention impl(or ops) supports MTP
# when cp_kv_cache_interleave_size > 1
supports_mtp_with_cp_non_trivial_interleave_size: bool = False
# some attention backends might not always want to return lse
# even if they can return lse (for efficiency reasons)
need_to_return_lse_for_decode: bool = False

View File

@ -252,35 +252,3 @@ def register_backend(
return lambda x: x
return decorator
# Backwards compatibility alias for plugins
class _BackendMeta(type):
"""Metaclass to provide deprecation warnings when accessing _Backend."""
def __getattribute__(cls, name: str):
if name not in ("__class__", "__mro__", "__name__"):
logger.warning(
"_Backend has been renamed to AttentionBackendEnum. "
"Please update your code to use AttentionBackendEnum instead. "
"_Backend will be removed in a future release."
)
return getattr(AttentionBackendEnum, name)
def __getitem__(cls, name: str):
logger.warning(
"_Backend has been renamed to AttentionBackendEnum. "
"Please update your code to use AttentionBackendEnum instead. "
"_Backend will be removed in a future release."
)
return AttentionBackendEnum[name]
class _Backend(metaclass=_BackendMeta):
"""Deprecated: Use AttentionBackendEnum instead.
This class is provided for backwards compatibility with plugins
and will be removed in a future release.
"""
pass

View File

@ -1,7 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
from functools import cache
from typing import cast, get_args
@ -73,39 +72,18 @@ def _cached_get_attn_backend(
) -> type[AttentionBackend]:
from vllm.platforms import current_platform
sig = inspect.signature(current_platform.get_attn_backend_cls)
if "use_v1" in sig.parameters:
logger.warning_once(
"use_v1 parameter for get_attn_backend_cls is deprecated and will "
"be removed in v0.13.0 or v1.0.0, whichever is soonest. Please "
"remove it from your plugin code."
)
attention_cls = current_platform.get_attn_backend_cls(
backend,
head_size,
dtype,
kv_cache_dtype,
block_size,
True, # use_v1
use_mla,
has_sink,
use_sparse,
use_mm_prefix,
attn_type,
)
else:
attention_cls = current_platform.get_attn_backend_cls(
backend,
head_size,
dtype,
kv_cache_dtype,
block_size,
use_mla,
has_sink,
use_sparse,
use_mm_prefix,
attn_type,
)
attention_cls = current_platform.get_attn_backend_cls(
backend,
head_size,
dtype,
kv_cache_dtype,
block_size,
use_mla,
has_sink,
use_sparse,
use_mm_prefix,
attn_type,
)
if not attention_cls:
raise ValueError(
f"Invalid attention backend for {current_platform.device_name}"

View File

@ -17,7 +17,6 @@ from vllm.config.utils import (
Range,
config,
get_hash_factors,
handle_deprecated,
hash_factors,
)
from vllm.logger import init_logger
@ -127,27 +126,6 @@ class PassConfig:
fuse_allreduce_rms: bool = Field(default=None)
"""Enable flashinfer allreduce fusion."""
# Deprecated flags
enable_fusion: bool = Field(default=None)
"""Deprecated in: v0.12.0. Use fuse_norm_quant and fuse_act_quant
instead. Will be removed in v0.13.0 or v1.0.0, whichever is sooner.
"""
enable_attn_fusion: bool = Field(default=None)
"""Deprecated in: v0.12.0. Use fuse_attn_quant instead.
Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
enable_noop: bool = Field(default=None)
"""Deprecated in: v0.12.0. Use eliminate_noops instead.
Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
enable_sequence_parallelism: bool = Field(default=None)
"""Deprecated in: v0.12.0. Use enable_sp instead.
Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
enable_async_tp: bool = Field(default=None)
"""Deprecated in: v0.12.0. Use fuse_gemm_comms instead.
Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
enable_fi_allreduce_fusion: bool = Field(default=None)
"""Deprecated in: v0.12.0. Use fuse_allreduce_rms instead.
Will be removed in v0.13.0 or v1.0.0, whichever is sooner."""
fi_allreduce_fusion_max_size_mb: float | None = None
"""The threshold of the communicated tensor sizes under which
vllm should use flashinfer fused allreduce. Specified as a
@ -206,15 +184,7 @@ class PassConfig:
Any future fields that don't affect compilation should be excluded.
"""
ignored_fields = [
"enable_fusion",
"enable_attn_fusion",
"enable_noop",
"enable_sequence_parallelism",
"enable_async_tp",
"enable_fi_allreduce_fusion",
]
return hash_factors(get_hash_factors(self, ignored_factors=ignored_fields))
return hash_factors(get_hash_factors(self, set()))
@field_validator(
"fuse_norm_quant",
@ -224,12 +194,6 @@ class PassConfig:
"enable_sp",
"fuse_gemm_comms",
"fuse_allreduce_rms",
"enable_fusion",
"enable_attn_fusion",
"enable_noop",
"enable_sequence_parallelism",
"enable_async_tp",
"enable_fi_allreduce_fusion",
mode="wrap",
)
@classmethod
@ -242,49 +206,6 @@ class PassConfig:
def __post_init__(self) -> None:
# Handle deprecation and defaults
# Map old flags to new flags and issue warnings
handle_deprecated(
self,
"enable_fusion",
["fuse_norm_quant", "fuse_act_quant"],
"v0.13.0 or v1.0.0, whichever is sooner",
)
handle_deprecated(
self,
"enable_attn_fusion",
"fuse_attn_quant",
"v0.13.0 or v1.0.0, whichever is sooner",
)
handle_deprecated(
self,
"enable_sequence_parallelism",
"enable_sp",
"v0.13.0 or v1.0.0, whichever is sooner",
)
handle_deprecated(
self,
"enable_async_tp",
"fuse_gemm_comms",
"v0.13.0 or v1.0.0, whichever is sooner",
)
handle_deprecated(
self,
"enable_fi_allreduce_fusion",
"fuse_allreduce_rms",
"v0.13.0 or v1.0.0, whichever is sooner",
)
handle_deprecated(
self,
"enable_noop",
"eliminate_noops",
"v0.13.0 or v1.0.0, whichever is sooner",
)
if not self.eliminate_noops:
if self.fuse_norm_quant or self.fuse_act_quant:
logger.warning_once(

View File

@ -73,17 +73,6 @@ logger = init_logger(__name__)
RunnerOption = Literal["auto", RunnerType]
ConvertType = Literal["none", "embed", "classify", "reward"]
ConvertOption = Literal["auto", ConvertType]
TaskOption = Literal[
"auto",
"generate",
"embedding",
"embed",
"classify",
"score",
"reward",
"transcription",
"draft",
]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal[
@ -93,12 +82,6 @@ HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]
ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"]
LayerBlockType = Literal["attention", "linear_attention", "mamba"]
_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = {
"generate": ["generate", "transcription"],
"pooling": ["embedding", "embed", "classify", "score", "reward"],
"draft": ["draft"],
}
_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = {
"generate": [],
"pooling": ["embed", "classify", "reward"],
@ -126,12 +109,6 @@ class ModelConfig:
"""Convert the model using adapters defined in
[vllm.model_executor.models.adapters][]. The most common use case is to
adapt a text generation model to be used for pooling tasks."""
task: TaskOption | None = None
"""[DEPRECATED] The task to use the model for. If the model supports more
than one model runner, this is used to select which model runner to run.
Note that the model may support other tasks using the same model runner.
"""
tokenizer: SkipValidation[str] = None # type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
@ -335,7 +312,6 @@ class ModelConfig:
ignored_factors = {
"runner",
"convert",
"task",
"tokenizer",
"tokenizer_mode",
"seed",
@ -510,97 +486,6 @@ class ModelConfig:
is_generative_model = registry.is_text_generation_model(architectures, self)
is_pooling_model = registry.is_pooling_model(architectures, self)
def _task_to_convert(task: TaskOption) -> ConvertType:
if task == "embedding" or task == "embed":
return "embed"
if task == "classify":
return "classify"
if task == "reward":
logger.warning(
"Pooling models now default support all pooling; "
"you can use it without any settings."
)
return "embed"
if task == "score":
new_task = self._get_default_pooling_task(architectures)
return "classify" if new_task == "classify" else "embed"
return "none"
if self.task is not None:
runner: RunnerOption = "auto"
convert: ConvertOption = "auto"
msg_prefix = (
"The 'task' option has been deprecated and will be "
"removed in v0.13.0 or v1.0, whichever comes first."
)
msg_hint = "Please remove this option."
is_generative_task = self.task in _RUNNER_TASKS["generate"]
is_pooling_task = self.task in _RUNNER_TASKS["pooling"]
if is_generative_model and is_pooling_model:
if is_generative_task:
runner = "generate"
convert = "auto"
msg_hint = (
"Please replace this option with `--runner "
"generate` to continue using this model "
"as a generative model."
)
elif is_pooling_task:
runner = "pooling"
convert = "auto"
msg_hint = (
"Please replace this option with `--runner "
"pooling` to continue using this model "
"as a pooling model."
)
else: # task == "auto"
pass
elif is_generative_model or is_pooling_model:
if is_generative_task:
runner = "generate"
convert = "auto"
msg_hint = "Please remove this option"
elif is_pooling_task:
runner = "pooling"
convert = _task_to_convert(self.task)
msg_hint = (
"Please replace this option with `--convert "
f"{convert}` to continue using this model "
"as a pooling model."
)
else: # task == "auto"
pass
else:
# Neither generative nor pooling model - try to convert if possible
if is_pooling_task:
runner = "pooling"
convert = _task_to_convert(self.task)
msg_hint = (
"Please replace this option with `--runner pooling "
f"--convert {convert}` to continue using this model "
"as a pooling model."
)
else:
debug_info = {
"architectures": architectures,
"is_generative_model": is_generative_model,
"is_pooling_model": is_pooling_model,
}
raise AssertionError(
"The model should be a generative or "
"pooling model when task is set to "
f"{self.task!r}. Found: {debug_info}"
)
self.runner = runner
self.convert = convert
msg = f"{msg_prefix} {msg_hint}"
warnings.warn(msg, DeprecationWarning, stacklevel=2)
self.runner_type = self._get_runner_type(architectures, self.runner)
self.convert_type = self._get_convert_type(
architectures, self.runner_type, self.convert
@ -903,6 +788,13 @@ class ModelConfig:
runner_type: RunnerType,
convert: ConvertOption,
) -> ConvertType:
if convert == "reward":
logger.warning(
"`--convert reward` is deprecated and will be removed in v0.15. "
"Please use `--convert embed` instead."
)
return "embed"
if convert != "auto":
return convert
@ -918,22 +810,6 @@ class ModelConfig:
return convert_type
def _get_default_pooling_task(
self,
architectures: list[str],
) -> Literal["embed", "classify", "reward"]:
if self.registry.is_cross_encoder_model(architectures, self):
return "classify"
for arch in architectures:
match = try_match_architecture_defaults(arch, runner_type="pooling")
if match:
_, (_, convert_type) = match
assert convert_type != "none"
return convert_type
return "embed"
def _parse_quant_hf_config(self, hf_config: PretrainedConfig):
quant_cfg = getattr(hf_config, "quantization_config", None)
if quant_cfg is None:

View File

@ -317,11 +317,6 @@ class ParallelConfig:
"num_redundant_experts."
)
if self.prefill_context_parallel_size > 1:
raise ValueError(
"Prefill context parallelism is not fully supported. "
"Please set prefill_context_parallel_size to 1."
)
return self
@property

View File

@ -111,13 +111,15 @@ class PoolerConfig:
def get_use_activation(o: object):
if softmax := getattr(o, "softmax", None) is not None:
logger.warning_once(
"softmax will be deprecated, please use use_activation instead."
"softmax will be deprecated and will be removed in v0.15. "
"Please use use_activation instead."
)
return softmax
if activation := getattr(o, "activation", None) is not None:
logger.warning_once(
"activation will be deprecated, please use use_activation instead."
"activation will be deprecated and will be removed in v0.15. "
"Please use use_activation instead."
)
return activation

View File

@ -820,11 +820,6 @@ class VllmConfig:
f"({self.parallel_config.cp_kv_cache_interleave_size})."
)
assert (
self.parallel_config.cp_kv_cache_interleave_size == 1
or self.speculative_config is None
), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
# Do this after all the updates to compilation_config.mode
self.compilation_config.set_splitting_ops_for_v1(
all2all_backend=self.parallel_config.all2all_backend,
@ -1014,7 +1009,7 @@ class VllmConfig:
max_graph_size = min(max_num_seqs * 2, 512)
# 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16
# up to max_graph_size
cuda_graph_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list(
range(256, max_graph_size + 1, 16))
In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`

View File

@ -5,7 +5,7 @@ import queue
import threading
import time
from abc import ABC, abstractmethod
from collections import deque
from collections import Counter, deque
from collections.abc import Callable
from dataclasses import asdict
from itertools import count
@ -54,11 +54,26 @@ class BlockStored(KVCacheEvent):
lora_id: int | None
medium: str | None
def __hash__(self) -> int:
return hash(
(
tuple(self.block_hashes),
self.parent_block_hash,
tuple(self.token_ids),
self.block_size,
self.lora_id,
self.medium,
)
)
class BlockRemoved(KVCacheEvent):
block_hashes: list[ExternalBlockHash]
medium: str | None
def __hash__(self) -> int:
return hash((tuple(self.block_hashes), self.medium))
class AllBlocksCleared(KVCacheEvent):
pass
@ -68,6 +83,119 @@ class KVEventBatch(EventBatch):
events: list[BlockStored | BlockRemoved | AllBlocksCleared]
class KVEventAggregator:
"""
Aggregates KV events across multiple workers.
Tracks how many times each event appears and returns only those
that were emitted by all workers.
"""
__slots__ = ("_event_counter", "_num_workers")
def __init__(self, num_workers: int) -> None:
if num_workers <= 0:
raise ValueError("num_workers must be greater than zero.")
self._event_counter: Counter[KVCacheEvent] = Counter()
self._num_workers: int = num_workers
def add_events(self, events: list[KVCacheEvent]) -> None:
"""
Add events from a worker batch.
:param events: List of KVCacheEvent objects.
"""
if not isinstance(events, list):
raise TypeError("events must be a list of KVCacheEvent.")
self._event_counter.update(events)
def get_common_events(self) -> list[KVCacheEvent]:
"""
Return events that appeared in all workers.
:return: List of events present in all workers.
"""
return [
event
for event, count in self._event_counter.items()
if count == self._num_workers
]
def get_all_events(self) -> list[KVCacheEvent]:
"""
Return all events for all workers.
:return: List of events for all workers.
"""
return list(self._event_counter.elements())
def clear_events(self) -> None:
"""
Clear all tracked events.
"""
self._event_counter.clear()
def increment_workers(self, count: int = 1) -> None:
"""
Increment the number of workers contributing events.
:param count: Number to increment the workers by.
"""
if count <= 0:
raise ValueError("count must be positive.")
self._num_workers += count
def reset_workers(self) -> None:
"""
Reset the number of workers to 1.
"""
self._num_workers = 1
def get_number_of_workers(self) -> int:
"""
Return the number of workers.
:return: int number of workers.
"""
return self._num_workers
def __repr__(self) -> str:
return (
f"<KVEventAggregator workers={self._num_workers}, "
f"events={len(self._event_counter)}>"
)
class KVConnectorKVEvents(ABC):
"""
Abstract base class for KV events.
Acts as a container for KV events from the connector.
"""
@abstractmethod
def add_events(self, events: list[KVCacheEvent]) -> None:
raise NotImplementedError
@abstractmethod
def aggregate(self) -> "KVConnectorKVEvents":
raise NotImplementedError
@abstractmethod
def increment_workers(self, count: int = 1) -> None:
raise NotImplementedError
@abstractmethod
def get_all_events(self) -> list[KVCacheEvent]:
raise NotImplementedError
@abstractmethod
def get_number_of_workers(self) -> int:
raise NotImplementedError
@abstractmethod
def clear_events(self) -> None:
raise NotImplementedError
class EventPublisher(ABC):
"""Lightweight publisher for EventBatch batches with data parallelism
support.

View File

@ -78,6 +78,7 @@ class KVOutputAggregator:
finished_sending = set[str]()
finished_recving = set[str]()
aggregated_kv_connector_stats = None
combined_kv_cache_events = None
invalid_block_ids = set[int]()
for model_runner_output in outputs:
assert model_runner_output is not None
@ -119,6 +120,19 @@ class KVOutputAggregator:
aggregated_kv_connector_stats.aggregate(kv_connector_stats)
)
# Combine kv_cache_events from all workers.
if combined_kv_cache_events is None:
# Use the first worker's kv_cache events as start event list.
combined_kv_cache_events = kv_output.kv_cache_events
elif kv_cache_events := kv_output.kv_cache_events:
assert isinstance(
combined_kv_cache_events,
type(kv_cache_events),
)
worker_kv_cache_events = kv_cache_events.get_all_events()
combined_kv_cache_events.add_events(worker_kv_cache_events)
combined_kv_cache_events.increment_workers(1)
invalid_block_ids |= kv_output.invalid_block_ids
# select output of the worker specified by output_rank
@ -129,6 +143,7 @@ class KVOutputAggregator:
finished_sending=finished_sending or None,
finished_recving=finished_recving or None,
kv_connector_stats=aggregated_kv_connector_stats or None,
kv_cache_events=combined_kv_cache_events or None,
invalid_block_ids=invalid_block_ids,
expected_finished_count=self._expected_finished_count,
)

View File

@ -49,7 +49,7 @@ from vllm.v1.outputs import KVConnectorOutput
if TYPE_CHECKING:
from vllm.config import VllmConfig
from vllm.distributed.kv_events import KVCacheEvent
from vllm.distributed.kv_events import KVCacheEvent, KVConnectorKVEvents
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
KVConnectorPromMetrics,
KVConnectorStats,
@ -379,6 +379,14 @@ class KVConnectorBase_V1(ABC):
"""
return None
def get_kv_connector_kv_cache_events(self) -> Optional["KVConnectorKVEvents"]:
"""
Get the KV connector kv cache events collected during the last interval.
This function should be called by the model runner every time after the
model execution and before cleanup.
"""
return None
def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None:
"""
Get the KVConnector handshake metadata for this connector.

View File

@ -1,14 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any
import torch
from lmcache.integration.vllm.vllm_v1_adapter import (
LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
)
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.config import VllmConfig
from vllm.distributed.kv_events import (
BlockStored,
KVCacheEvent,
KVConnectorKVEvents,
KVEventAggregator,
)
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1,
KVConnectorMetadata,
@ -16,6 +20,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
)
from vllm.logger import init_logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.outputs import KVConnectorOutput
if TYPE_CHECKING:
from vllm.forward_context import ForwardContext
@ -26,6 +31,44 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
class LMCacheKVEvents(KVConnectorKVEvents):
"""
Concrete implementation of KVConnectorKVEvents using KVEventAggregator.
"""
def __init__(self, num_workers: int) -> None:
self._aggregator = KVEventAggregator(num_workers)
def add_events(self, events: list[KVCacheEvent]) -> None:
self._aggregator.add_events(events)
def aggregate(self) -> "LMCacheKVEvents":
"""
Aggregate KV events and retain only common events.
"""
common_events = self._aggregator.get_common_events()
self._aggregator.clear_events()
self._aggregator.add_events(common_events)
self._aggregator.reset_workers()
return self
def increment_workers(self, count: int = 1) -> None:
self._aggregator.increment_workers(count)
def get_all_events(self) -> list[KVCacheEvent]:
return self._aggregator.get_all_events()
def get_number_of_workers(self) -> int:
return self._aggregator.get_number_of_workers()
def clear_events(self) -> None:
self._aggregator.clear_events()
self._aggregator.reset_workers()
def __repr__(self) -> str:
return f"<LMCacheKVEvents events={self.get_all_events()}>"
class LMCacheConnectorV1(KVConnectorBase_V1):
def __init__(
self,
@ -50,10 +93,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
cls = _adapter.LMCacheConnectorV1Impl
else:
logger.info("Initializing latest dev LMCache connector")
# lazy import
from lmcache.integration.vllm.vllm_v1_adapter import (
LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
)
cls = LMCacheConnectorLatestImpl
self._lmcache_engine = cls(vllm_config, role, self)
self._kv_cache_events: LMCacheKVEvents | None = None
# ==============================
# Worker-side methods
# ==============================
@ -151,6 +201,31 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
# Fallback for older versions that don't support this method
return set()
def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None:
"""
Get the KV connector kv cache events collected during the last interval.
"""
events = self._lmcache_engine.get_kv_events() # type: ignore [attr-defined]
if not events:
return None
blocks: list[BlockStored] = [
BlockStored(
block_hashes=e.block_hashes,
parent_block_hash=e.parent_block_hash,
token_ids=e.token_ids,
lora_id=e.lora_id,
block_size=e.block_size,
medium=e.medium,
)
for e in events
]
lmcache_kv_events = LMCacheKVEvents(num_workers=1)
lmcache_kv_events.add_events(blocks)
return lmcache_kv_events
# ==============================
# Scheduler-side methods
# ==============================
@ -198,6 +273,28 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
"""
return self._lmcache_engine.build_connector_meta(scheduler_output)
def update_connector_output(self, connector_output: KVConnectorOutput):
"""
Update KVConnector state from worker-side connectors output.
Args:
connector_output (KVConnectorOutput): the worker-side
connectors output.
"""
# Get the KV events
kv_cache_events = connector_output.kv_cache_events
if not kv_cache_events or not isinstance(kv_cache_events, LMCacheKVEvents):
return
if self._kv_cache_events is None:
self._kv_cache_events = kv_cache_events
else:
self._kv_cache_events.add_events(kv_cache_events.get_all_events())
self._kv_cache_events.increment_workers(
kv_cache_events.get_number_of_workers()
)
return
def request_finished(
self,
request: "Request",
@ -214,3 +311,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
returned by the engine.
"""
return self._lmcache_engine.request_finished(request, block_ids)
def take_events(self) -> Iterable["KVCacheEvent"]:
"""
Take the KV cache events from the connector.
Yields:
New KV cache events since the last call.
"""
if self._kv_cache_events is not None:
self._kv_cache_events.aggregate()
kv_cache_events = self._kv_cache_events.get_all_events()
yield from kv_cache_events
self._kv_cache_events.clear_events()
self._kv_cache_events = None

View File

@ -259,6 +259,12 @@ class MultiConnector(KVConnectorBase_V1):
agg_block_ids |= c.get_block_ids_with_load_errors()
return agg_block_ids
# TODO: Add a generic implementation of 'get_kv_connector_kv_cache_events' method
# for the MultiConnector. It should be able to get events from multiple
# connectors, handling the case where only a subset of the requested connectors
# implements the 'get_kv_connector_kv_cache_events'
# Follow on PR from https://github.com/vllm-project/vllm/pull/28309#pullrequestreview-3566351082
# ==============================
# Scheduler-side methods
# ==============================

View File

@ -71,7 +71,6 @@ from vllm.config.model import (
LogprobsMode,
ModelDType,
RunnerOption,
TaskOption,
TokenizerMode,
)
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
@ -360,7 +359,6 @@ class EngineArgs:
hf_config_path: str | None = ModelConfig.hf_config_path
runner: RunnerOption = ModelConfig.runner
convert: ConvertOption = ModelConfig.convert
task: TaskOption | None = ModelConfig.task
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
@ -373,9 +371,8 @@ class EngineArgs:
config_format: str = ModelConfig.config_format
dtype: ModelDType = ModelConfig.dtype
kv_cache_dtype: CacheDType = CacheConfig.cache_dtype
seed: int | None = 0
seed: int = ModelConfig.seed
max_model_len: int | None = ModelConfig.max_model_len
cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes
cudagraph_capture_sizes: list[int] | None = (
CompilationConfig.cudagraph_capture_sizes
)
@ -463,7 +460,6 @@ class EngineArgs:
MultiModalConfig, "media_io_kwargs"
)
mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
disable_mm_preprocessor_cache: bool = False # DEPRECATED
mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
mm_processor_cache_type: MMCacheType | None = (
MultiModalConfig.mm_processor_cache_type
@ -560,9 +556,6 @@ class EngineArgs:
use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load
pt_load_map_location: str = LoadConfig.pt_load_map_location
# DEPRECATED
enable_multimodal_encoder_data_parallel: bool = False
logits_processors: list[str | type[LogitsProcessor]] | None = (
ModelConfig.logits_processors
)
@ -630,7 +623,6 @@ class EngineArgs:
model_group.add_argument("--model", **model_kwargs["model"])
model_group.add_argument("--runner", **model_kwargs["runner"])
model_group.add_argument("--convert", **model_kwargs["convert"])
model_group.add_argument("--task", **model_kwargs["task"], deprecated=True)
model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"])
model_group.add_argument(
@ -884,11 +876,6 @@ class EngineArgs:
parallel_group.add_argument(
"--worker-extension-cls", **parallel_kwargs["worker_extension_cls"]
)
parallel_group.add_argument(
"--enable-multimodal-encoder-data-parallel",
action="store_true",
deprecated=True,
)
# KV cache arguments
cache_kwargs = get_kwargs(CacheConfig)
@ -962,9 +949,6 @@ class EngineArgs:
multimodal_group.add_argument(
"--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"]
)
multimodal_group.add_argument(
"--disable-mm-preprocessor-cache", action="store_true", deprecated=True
)
multimodal_group.add_argument(
"--mm-processor-cache-type", **multimodal_kwargs["mm_processor_cache_type"]
)
@ -1126,15 +1110,6 @@ class EngineArgs:
compilation_group.add_argument(
"--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"]
)
compilation_kwargs["cudagraph_capture_sizes"]["help"] = (
"--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or v1.0.0,"
" whichever is soonest. Please use --cudagraph-capture-sizes instead."
)
compilation_group.add_argument(
"--cuda-graph-sizes",
**compilation_kwargs["cudagraph_capture_sizes"],
deprecated=True,
)
compilation_group.add_argument(
"--max-cudagraph-capture-size",
**compilation_kwargs["max_cudagraph_capture_size"],
@ -1207,62 +1182,20 @@ class EngineArgs:
if is_gguf(self.model):
self.quantization = self.load_format = "gguf"
# NOTE(woosuk): In V1, we use separate processes for workers (unless
# VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
# doesn't affect the user process.
if self.seed is None:
logger.warning_once(
"`seed=None` is equivalent to `seed=0` in V1 Engine. "
"You will no longer be allowed to pass `None` in v0.13.",
scope="local",
if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
logger.warning(
"The global random seed is set to %d. Since "
"VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
"affect the random state of the Python process that "
"launched vLLM.",
self.seed,
)
self.seed = 0
if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
logger.warning(
"The global random seed is set to %d. Since "
"VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may "
"affect the random state of the Python process that "
"launched vLLM.",
self.seed,
)
if self.disable_mm_preprocessor_cache:
logger.warning_once(
"`--disable-mm-preprocessor-cache` is deprecated "
"and will be removed in v0.13. "
"Please use `--mm-processor-cache-gb 0` instead.",
scope="local",
)
self.mm_processor_cache_gb = 0
elif envs.VLLM_MM_INPUT_CACHE_GIB != 4:
logger.warning_once(
"VLLM_MM_INPUT_CACHE_GIB` is deprecated "
"and will be removed in v0.13. "
"Please use `--mm-processor-cache-gb %d` instead.",
envs.VLLM_MM_INPUT_CACHE_GIB,
scope="local",
)
self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB
if self.enable_multimodal_encoder_data_parallel:
logger.warning_once(
"--enable-multimodal-encoder-data-parallel` is deprecated "
"and will be removed in v0.13. "
"Please use `--mm-encoder-tp-mode data` instead.",
scope="local",
)
self.mm_encoder_tp_mode = "data"
return ModelConfig(
model=self.model,
hf_config_path=self.hf_config_path,
runner=self.runner,
convert=self.convert,
task=self.task,
tokenizer=self.tokenizer,
tokenizer_mode=self.tokenizer_mode,
trust_remote_code=self.trust_remote_code,
@ -1747,18 +1680,6 @@ class EngineArgs:
# Compilation config overrides
compilation_config = copy.deepcopy(self.compilation_config)
if self.cuda_graph_sizes is not None:
logger.warning(
"--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or "
"v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes "
"instead."
)
if compilation_config.cudagraph_capture_sizes is not None:
raise ValueError(
"cuda_graph_sizes and compilation_config."
"cudagraph_capture_sizes are mutually exclusive"
)
compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes
if self.cudagraph_capture_sizes is not None:
if compilation_config.cudagraph_capture_sizes is not None:
raise ValueError(
@ -1868,6 +1789,7 @@ class EngineArgs:
except Exception:
# This is only used to set default_max_num_batched_tokens
device_memory = 0
device_name = ""
# NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces
# throughput, see PR #17885 for more details.
@ -1932,16 +1854,6 @@ class EngineArgs:
default_chunked_prefill = model_config.is_chunked_prefill_supported
default_prefix_caching = model_config.is_prefix_caching_supported
if self.prefill_context_parallel_size > 1:
default_chunked_prefill = False
default_prefix_caching = False
logger.warning_once(
"--prefill-context-parallel-size > 1 is not compatible with "
"chunked prefill and prefix caching now. Chunked prefill "
"and prefix caching have been disabled by default.",
scope="local",
)
if self.enable_chunked_prefill is None:
self.enable_chunked_prefill = default_chunked_prefill
@ -2127,11 +2039,13 @@ def human_readable_int(value):
"k": 10**3,
"m": 10**6,
"g": 10**9,
"t": 10**12,
}
binary_multiplier = {
"K": 2**10,
"M": 2**20,
"G": 2**30,
"T": 2**40,
}
number, suffix = match.groups()

View File

@ -198,7 +198,7 @@ class LLM:
quantization: QuantizationMethods | None = None,
revision: str | None = None,
tokenizer_revision: str | None = None,
seed: int | None = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: float = 4,
cpu_offload_gb: float = 0,

View File

@ -176,7 +176,7 @@ class FrontendArgs:
enable_force_include_usage: bool = False
"""If set to True, including usage on every request."""
enable_tokenizer_info_endpoint: bool = False
"""Enable the /get_tokenizer_info endpoint. May expose chat
"""Enable the `/tokenizer_info` endpoint. May expose chat
templates and other tokenizer configuration."""
enable_log_outputs: bool = False
"""If True, log model outputs (generations).

View File

@ -72,10 +72,9 @@ if TYPE_CHECKING:
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
VLLM_MEDIA_CONNECTOR: str = "http"
VLLM_MM_INPUT_CACHE_GIB: int = 4
VLLM_TARGET_DEVICE: str = "cuda"
VLLM_MAIN_CUDA_VERSION: str = "12.9"
VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee"
MAX_JOBS: str | None = None
NVCC_THREADS: str | None = None
VLLM_USE_PRECOMPILED: bool = False
@ -457,11 +456,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
or "12.9",
# Controls PyTorch float32 matmul precision mode within vLLM workers.
# Valid options mirror torch.set_float32_matmul_precision
# Accepted values:
# - "ieee" (default): force full IEEE FP32 matmul precision.
# - "tf32": enable TensorFloat32-based fast matmul.
"VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices(
"VLLM_FLOAT32_MATMUL_PRECISION",
"highest",
["highest", "high", "medium"],
"ieee",
["ieee", "tf32"],
case_sensitive=False,
),
# Maximum number of compilation jobs to run in parallel.
@ -786,9 +787,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# imported at runtime.
# If a non-existing backend is used, an AssertionError will be thrown.
"VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"),
# [DEPRECATED] Cache size (in GiB per process) for multimodal input cache
# Default is 4 GiB per API process + 4 GiB per engine core process
"VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")),
# Path to the XLA persistent cache directory.
# Only used for XLA devices such as TPUs.
"VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser(
@ -1681,7 +1679,6 @@ def compile_factors() -> dict[str, object]:
"VLLM_MEDIA_CONNECTOR",
"VLLM_ASSETS_CACHE",
"VLLM_ASSETS_CACHE_MODEL_CLEAN",
"VLLM_MM_INPUT_CACHE_GIB",
"VLLM_WORKER_MULTIPROC_METHOD",
"VLLM_ENABLE_V1_MULTIPROCESSING",
"VLLM_V1_OUTPUT_PROC_CHUNK_SIZE",

View File

@ -229,6 +229,11 @@ def suppress_logging(level: int = logging.INFO) -> Generator[None, Any, None]:
# guaranteed by the Python GIL.
_configure_vllm_root_logger()
# Transformers uses httpx to access the Hugging Face Hub. httpx is quite verbose,
# so we set its logging level to WARNING when vLLM's logging level is INFO.
if envs.VLLM_LOGGING_LEVEL == "INFO":
logging.getLogger("httpx").setLevel(logging.WARNING)
logger = init_logger(__name__)

View File

@ -1257,6 +1257,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]):
)
height = min(height, overrides.height)
num_frames = max(num_frames, 2) # GLM 4.6V requires 2 frames
video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8)
video_items = []
for i in range(num_videos):

View File

@ -53,6 +53,22 @@ The output embeddings must be one of the following formats:
"""
def _require_is_multimodal(is_multimodal: Tensor | None) -> Tensor:
"""
A helper function to be used in the context of
[vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids][]
to provide a better error message.
"""
if is_multimodal is None:
raise ValueError(
"`embed_input_ids` now requires `is_multimodal` arg, "
"please update your model runner according to "
"https://github.com/vllm-project/vllm/pull/16229."
)
return is_multimodal
@runtime_checkable
class SupportsMultiModal(Protocol):
"""The interface required for all multi-modal models."""
@ -111,13 +127,7 @@ class SupportsMultiModal(Protocol):
the appearances of their corresponding multimodal data item in the
input prompt.
"""
if hasattr(self, "get_multimodal_embeddings"):
logger.warning_once(
"`get_multimodal_embeddings` for vLLM models is deprecated and will be "
"removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
"this method to `embed_multimodal`."
)
return self.get_multimodal_embeddings(**kwargs)
...
def get_language_model(self) -> VllmModel:
"""
@ -196,17 +206,10 @@ class SupportsMultiModal(Protocol):
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
return inputs_embeds
if is_multimodal is None:
raise ValueError(
"`embed_input_ids` now requires `is_multimodal` arg, "
"please update your model runner according to "
"https://github.com/vllm-project/vllm/pull/16229."
)
return _merge_multimodal_embeddings(
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
is_multimodal=is_multimodal,
is_multimodal=_require_is_multimodal(is_multimodal),
)

View File

@ -49,13 +49,7 @@ class VllmModel(Protocol[T_co]):
def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
"""Apply token embeddings to `input_ids`."""
if hasattr(self, "get_input_embeddings"):
logger.warning_once(
"`get_input_embeddings` for vLLM models is deprecated and will be "
"removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
"this method to `embed_input_ids`."
)
return self.get_input_embeddings(input_ids)
...
def forward(self, input_ids: torch.Tensor, positions: torch.Tensor) -> T_co: ...
@ -68,15 +62,6 @@ def _check_vllm_model_init(model: type[object] | object) -> bool:
def _check_vllm_model_embed_input_ids(model: type[object] | object) -> bool:
model_embed_input_ids = getattr(model, "embed_input_ids", None)
if not callable(model_embed_input_ids):
model_get_input_embeddings = getattr(model, "get_input_embeddings", None)
if callable(model_get_input_embeddings):
logger.warning(
"`get_input_embeddings` for vLLM models is deprecated and will be "
"removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename "
"this method to `embed_input_ids`."
)
model.embed_input_ids = model_get_input_embeddings
return True
logger.warning(
"The model (%s) is missing the `embed_input_ids` method.",
model,

View File

@ -18,15 +18,10 @@ from vllm.model_executor.models.deepseek_v2 import (
DeepseekV2DecoderLayer,
DeepseekV2Model,
)
from vllm.model_executor.models.interfaces import MultiModalEmbeddings
from vllm.model_executor.models.mistral_large_3 import MistralLarge3ForCausalLM
from vllm.multimodal.inputs import NestedTensors
from .utils import (
_merge_multimodal_embeddings,
make_empty_intermediate_tensors_factory,
maybe_prefix,
)
from .interfaces import SupportsMultiModal
from .utils import make_empty_intermediate_tensors_factory, maybe_prefix
logger = init_logger(__name__)
@ -117,26 +112,10 @@ class EagleMistralLarge3ForCausalLM(MistralLarge3ForCausalLM):
)
super().__init__(vllm_config=vllm_config, prefix=prefix)
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: MultiModalEmbeddings | None = None,
*,
is_multimodal: torch.Tensor | None = None,
handle_oov_mm_token: bool = False,
) -> torch.Tensor:
inputs_embeds = super().embed_input_ids(input_ids)
def get_language_model(self) -> torch.nn.Module:
return self.model
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
return inputs_embeds
assert is_multimodal is not None
return _merge_multimodal_embeddings(
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
is_multimodal=is_multimodal,
)
embed_input_ids = SupportsMultiModal.embed_input_ids # type: ignore
def forward(
self,
@ -155,11 +134,3 @@ class EagleMistralLarge3ForCausalLM(MistralLarge3ForCausalLM):
"model.embed_tokens.weight",
"lm_head.weight",
}
def embed_input_ids(
self,
input_ids: torch.Tensor,
multimodal_embeddings: NestedTensors | None = None,
is_multimodal: torch.Tensor | None = None,
) -> torch.Tensor:
return self.model.embed_input_ids(input_ids)

View File

@ -64,6 +64,7 @@ from .interfaces import (
SupportsMultiModal,
SupportsPP,
SupportsQuant,
_require_is_multimodal,
)
from .utils import (
AutoWeightsLoader,
@ -687,17 +688,10 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant)
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
return inputs_embeds
if is_multimodal is None:
raise ValueError(
"`embed_input_ids` now requires `is_multimodal` arg, "
"please update your model runner according to "
"https://github.com/vllm-project/vllm/pull/16229."
)
return _merge_multimodal_embeddings(
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
is_multimodal=is_multimodal,
is_multimodal=_require_is_multimodal(is_multimodal),
)
def forward(

View File

@ -93,6 +93,7 @@ from .interfaces import (
SupportsMRoPE,
SupportsMultiModal,
SupportsPP,
_require_is_multimodal,
)
from .qwen2_5_vl import (
Qwen2_5_VisionAttention,
@ -1572,12 +1573,7 @@ class Qwen3VLForConditionalGeneration(
if multimodal_embeddings is None or len(multimodal_embeddings) == 0:
return inputs_embeds
if is_multimodal is None:
raise ValueError(
"`embed_input_ids` now requires `is_multimodal` arg, "
"please update your model runner according to "
"https://github.com/vllm-project/vllm/pull/16229."
)
is_multimodal = _require_is_multimodal(is_multimodal)
if self.use_deepstack:
(

View File

@ -283,8 +283,15 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
# They can be passed to the underlying
# media loaders (e.g. custom implementations)
# for flexible control.
# Allow per-request override of video backend via kwargs.
# This enables users to specify a different backend than the
# global VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
# --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
video_loader_backend = (
kwargs.pop("video_backend", None) or envs.VLLM_VIDEO_LOADER_BACKEND
)
self.kwargs = kwargs
video_loader_backend = envs.VLLM_VIDEO_LOADER_BACKEND
self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend)
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:

View File

@ -19,6 +19,10 @@ logger = init_logger(__name__)
class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
"""
Reasoning parser for MiniMax M2 model.
MiniMax M2 models don't generate <think> start token, only </think> end
token. All content before </think> is reasoning, content after is the
actual response.
"""
@property
@ -31,6 +35,45 @@ class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
"""The token that ends reasoning content."""
return "</think>"
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
"""
Extract reasoning content from a delta message for streaming.
MiniMax M2 models don't generate <think> start token, so we assume
all content is reasoning until we encounter the </think> end token.
"""
# Skip single end token
if len(delta_token_ids) == 1 and delta_token_ids[0] == self.end_token_id:
return None
# Check if end token has already appeared in previous tokens
# meaning we're past the reasoning phase
if self.end_token_id in previous_token_ids:
# We're past the reasoning phase, this is content
return DeltaMessage(content=delta_text)
# Check if end token is in delta tokens
if self.end_token_id in delta_token_ids:
# End token in delta, split reasoning and content
end_index = delta_text.find(self.end_token)
reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.end_token) :]
return DeltaMessage(
reasoning=reasoning if reasoning else None,
content=content if content else None,
)
# No end token yet, all content is reasoning
return DeltaMessage(reasoning=delta_text)
class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
"""

View File

@ -1654,6 +1654,33 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
# Convert from (L, N, P) to (N, P, L)
self.W_UK_T = W_UK.permute(1, 2, 0)
def _concat_k_nope_k_pe(
self, k_nope: torch.Tensor, k_pe: torch.Tensor
) -> torch.Tensor:
"""
Efficiently concatenate k_nope and k_pe tensors along the last dimension.
This function avoids the performance penalty of torch.cat with expanded
non-contiguous tensors by pre-allocating the output and using direct copies.
Args:
k_nope: Tensor of shape [..., nope_dim]
k_pe: Tensor to broadcast and concatenate, typically shape [..., 1, pe_dim]
or [..., pe_dim]
Returns:
Tensor of shape [..., nope_dim + pe_dim]
"""
k = torch.empty(
(*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]),
dtype=k_nope.dtype,
device=k_nope.device,
)
# Direct copies with efficient broadcasting
k[..., : k_nope.shape[-1]] = k_nope
k[..., k_nope.shape[-1] :] = k_pe
return k
def _compute_prefill_context(
self,
q: torch.Tensor,
@ -1690,7 +1717,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
)
k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
k = self._concat_k_nope_k_pe(k_nope, k_pe)
attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
prefill=prefill_metadata,
@ -1794,7 +1821,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
-1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim
)
k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
k = self._concat_k_nope_k_pe(k_nope, k_pe)
attn_output, attn_softmax_lse = self._run_prefill_context_chunk(
prefill=prefill_metadata,
@ -1843,7 +1870,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
)
k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1)
k = self._concat_k_nope_k_pe(k_nope, k_pe)
output_prefill = self._run_prefill_new_tokens(
prefill=attn_metadata.prefill,

View File

@ -12,9 +12,11 @@ from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.v1.core.sched.output import SchedulerOutput
if TYPE_CHECKING:
from vllm.distributed.kv_events import KVConnectorKVEvents
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
else:
KVConnectorStats = object
KVConnectorKVEvents = object
class LogprobsLists(NamedTuple):
@ -108,6 +110,7 @@ class KVConnectorOutput:
finished_sending: set[str] | None = None
finished_recving: set[str] | None = None
kv_connector_stats: KVConnectorStats | None = None
kv_cache_events: KVConnectorKVEvents | None = None
# IDs of externally computed KV blocks that failed to load.
# Requests referencing these blocks should be rescheduled to recompute them
invalid_block_ids: set[int] = field(default_factory=set)
@ -123,6 +126,7 @@ class KVConnectorOutput:
not self.finished_sending
and not self.finished_recving
and not self.kv_connector_stats
and not self.kv_cache_events
and not self.invalid_block_ids
)

View File

@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any, cast
from vllm.config import VllmConfig, get_layers_from_vllm_config
if TYPE_CHECKING:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
else:
AttentionLayerBase = object
def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None:
pcp_size = vllm_config.parallel_config.prefill_context_parallel_size
dcp_size = vllm_config.parallel_config.decode_context_parallel_size
interleave_size = vllm_config.parallel_config.cp_kv_cache_interleave_size
if pcp_size * dcp_size > 1:
layer_type = cast(type[Any], AttentionLayerBase)
layers = get_layers_from_vllm_config(vllm_config, layer_type)
for layer in layers.values():
layer_impl = getattr(layer, "impl", None)
if layer_impl is None:
continue
if vllm_config.speculative_config is not None and interleave_size > 1:
assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, (
"MTP with cp_kv_cache_interleave_size > 1 is not "
f"supported in {layer_impl.__class__.__name__}."
)
if dcp_size > 1:
assert layer_impl.need_to_return_lse_for_decode, (
"DCP requires attention impls to return"
" the softmax lse for decode, but the impl "
f"{layer_impl.__class__.__name__} "
"does not return the softmax lse for decode."
)
if pcp_size > 1:
assert layer_impl.supports_pcp, (
"PCP requires attention impls' support, "
f"but the impl {layer_impl.__class__.__name__} "
"does not support PCP."
)

View File

@ -149,6 +149,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer
from vllm.v1.structured_output.utils import apply_grammar_bitmask
from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
from vllm.v1.worker.cp_utils import check_attention_cp_compatibility
from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
@ -4846,6 +4847,9 @@ class GPUModelRunner(
attention_backend_list, kv_cache_config.kv_cache_groups
)
# Check if attention backend supports PCP&DCP and related features.
check_attention_cp_compatibility(self.vllm_config)
for i, attn_backend_map in enumerate(attention_backend_maps):
self.attn_groups.append(create_attn_groups(attn_backend_map, i))
@ -5504,20 +5508,6 @@ class GPUModelRunner(
kv_transfer_group.register_kv_caches(kv_caches)
kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)
if self.dcp_world_size > 1:
layer_type = cast(type[Any], AttentionLayerBase)
layers = get_layers_from_vllm_config(self.vllm_config, layer_type)
for layer in layers.values():
layer_impl = getattr(layer, "impl", None)
if layer_impl is None:
continue
assert layer_impl.need_to_return_lse_for_decode, (
"DCP requires attention impls to return"
" the softmax lse for decode, but the impl "
f"{layer_impl.__class__.__name__} "
"does not return the softmax lse for decode."
)
def may_add_encoder_only_layers_to_kv_cache_config(self) -> None:
"""
Add encoder-only layers to the KV cache config.

View File

@ -81,7 +81,7 @@ class Worker(WorkerBase):
# configure float32 matmul precision according to vLLM env.
precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
torch.set_float32_matmul_precision(precision)
torch.backends.cuda.matmul.fp32_precision = precision
if self.model_config.trust_remote_code:
# note: lazy import to avoid importing torch before initializing

View File

@ -22,7 +22,6 @@ from vllm.distributed.kv_transfer import (
has_kv_transfer_group,
)
from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.logger import init_logger
from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig
@ -138,16 +137,10 @@ class KVConnectorModelRunnerMixin:
)
output.invalid_block_ids = kv_connector.get_block_ids_with_load_errors()
output.kv_connector_stats = (
KVConnectorModelRunnerMixin.get_kv_connector_stats()
)
kv_connector.clear_connector_metadata()
output.kv_connector_stats = kv_connector.get_kv_connector_stats()
output.kv_cache_events = kv_connector.get_kv_connector_kv_cache_events()
@staticmethod
def get_kv_connector_stats() -> KVConnectorStats | None:
if has_kv_transfer_group():
return get_kv_transfer_group().get_kv_connector_stats()
return None
kv_connector.clear_connector_metadata()
@staticmethod
def use_uniform_kv_cache(