Merge branch 'main' into model_arch_cfg

This commit is contained in:
Xingyu Liu 2025-12-10 10:16:26 -08:00 committed by GitHub
commit 72730a74d5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
58 changed files with 867 additions and 234 deletions

View File

@ -36,6 +36,11 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
# Run kernel tests
docker exec cpu-test bash -c "
set -e

View File

@ -574,7 +574,7 @@ async def benchmark(
)
print(
"{:<40} {:<10.2f}".format(
"Total Token throughput (tok/s):", metrics.total_token_throughput
"Total token throughput (tok/s):", metrics.total_token_throughput
)
)

View File

@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
# Build ACL with CMake
set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
set(CMAKE_BUILD_TYPE "Release")
set(ARM_COMPUTE_ARCH "armv8.2-a")
set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
set(ARM_COMPUTE_ENABLE_OPENMP "ON")
set(ARM_COMPUTE_ENABLE_WERROR "OFF")
set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
set(ARM_COMPUTE_BUILD_TESTING "OFF")
set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF

View File

@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
input.casual = casual;
input.isa = isa;
input.enable_kv_split = enable_kv_split;
TORCH_CHECK(casual, "Only supports casual mask for now.");
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {

View File

@ -186,7 +186,7 @@ struct AttentionMetadata {
// - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
// * q_tile_size * 4, partial output, max + sum (float)
// Reduction scratchpad contains:
// - flags: bool array to indicate wether the split is finished
// - flags: bool array to indicate whether the split is finished
// - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
// - max, sum: 2 * split_num * q_tile_size * 4
class AttentionScratchPad {

View File

@ -617,7 +617,7 @@ struct MacheteCollectiveMma {
// Same as upstream, should be kept the same when possible, not formatted for
// easier comparison
// with `SwapAB ? N : M -> M` since we dont support SwapAB
// with `SwapAB ? N : M -> M` since we don't support SwapAB
// clang-format off
template<class ProblemShape>
static bool

View File

@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
}
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
// Find the min val of div2 that doesn't increase N/(div1*div2)
int mindiv(int N, int div1, int div2) {
int nPrRnd = div1 * div2;
int rnds0 = N / nPrRnd;
nPrRnd -= div1 * 3;
int rnds3 = N / nPrRnd;
nPrRnd -= div1;
int rnds4 = N / nPrRnd;
nPrRnd -= div1;
int rnds5 = N / nPrRnd;
nPrRnd -= div1;
int rnds6 = N / nPrRnd;
nPrRnd -= div1;
int rnds7 = N / nPrRnd;
nPrRnd -= div1;
int rnds8 = N / nPrRnd;
nPrRnd -= div1;
int rnds9 = N / nPrRnd;
nPrRnd -= div1;
int rtn = div2;
if (rnds0 == rnds3) rtn = div2 - 3;
if (rnds0 == rnds4) rtn = div2 - 4;
if (rnds0 == rnds5) rtn = div2 - 5;
if (rnds0 == rnds6) rtn = div2 - 6;
if (rnds0 == rnds7) rtn = div2 - 7;
if (rnds0 == rnds8) rtn = div2 - 8;
if (rnds0 == rnds9) rtn = div2 - 9;
return rtn;
int rnds[13];
for (int i = 0; i < 13; i++) {
rnds[i] = (N + nPrRnd - 1) / nPrRnd;
nPrRnd -= div1;
}
for (int i = 12; i >= 0; i--)
if (rnds[0] == rnds[i]) return (div2 - i);
}
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int max_lds_len = get_lds_size() / 2;
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
_N) \
{ \
dim3 block(64, _WvPrGrp); \
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} else if (K_in * N_in <= max_lds_len * 1.2) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \
wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} \
#define WVSPLITK(_YTILE, _UNRL, _N) \
{ \
dim3 block(64, 16); \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \
wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
else if (K_in * N_in <= max_lds_len * 1.2) \
wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
else \
wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
}
#define WVSPLIT_TILE(_sYT, __N) \
{ \
bool fit_lds = (K_in * N_in <= max_lds_len); \
if (_sYT <= 1) \
WVSPLITK(1, 4, __N) \
else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
WVSPLITK(2, 2, __N) \
else if (_sYT <= 4 * 3) \
WVSPLITK(3, 2, __N) \
else if (__N == 4) \
WVSPLITK(4, 1, __N) \
else \
WVSPLITK(4, 2, __N) \
}
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
: nullptr;
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
// first shoot for biggest tile-size that keeps all simd busy,
// then cut the active waves to balance their distribution...
int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
switch (N_in) {
case 1:
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1)
WVSPLIT_TILE(sYT, 1)
break;
case 2:
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2)
WVSPLIT_TILE(sYT, 2)
break;
case 3:
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3)
WVSPLIT_TILE(sYT, 3)
break;
case 4:
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4)
WVSPLIT_TILE(sYT, 4)
break;
default:
throw std::runtime_error(

View File

@ -84,7 +84,7 @@ Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
Total token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88

View File

@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request
### v1 Metrics
In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
In v1, an extensive set of metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example:
- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0-1).
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
- `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
- `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests.
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.

View File

@ -22,7 +22,7 @@ python tools/install_nixl_from_source_ubuntu.py
NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:
```bash
# Example UCX configuration, adjust according to your enviroment
# Example UCX configuration, adjust according to your environment
export UCX_TLS=all # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1"
```

View File

@ -0,0 +1,149 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import logging
from pathlib import Path
from typing import Literal
# MkDocs hook logger; messages show up in the `mkdocs build`/`serve` output.
logger = logging.getLogger("mkdocs")
# Repo root is four levels above this hook file
# (docs/mkdocs/hooks/<this file> -> repo root).
ROOT_DIR = Path(__file__).parent.parent.parent.parent
DOCS_DIR = ROOT_DIR / "docs"
# Output directory for the generated markdown tables.
GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics"
# Files to scan for metric definitions - each will generate a separate table
METRIC_SOURCE_FILES = [
    {"path": "vllm/v1/metrics/loggers.py", "output": "general.md"},
    {
        "path": "vllm/v1/spec_decode/metrics.py",
        "output": "spec_decode.md",
    },
    {
        "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
        "output": "nixl_connector.md",
    },
]
class MetricExtractor(ast.NodeVisitor):
"""AST visitor to extract metric definitions."""
def __init__(self):
self.metrics: list[dict[str, str]] = []
def visit_Call(self, node: ast.Call) -> None:
"""Visit function calls to find metric class instantiations."""
metric_type = self._get_metric_type(node)
if metric_type:
name = self._extract_kwarg(node, "name")
documentation = self._extract_kwarg(node, "documentation")
if name:
self.metrics.append(
{
"name": name,
"type": metric_type,
"documentation": documentation or "",
}
)
self.generic_visit(node)
def _get_metric_type(self, node: ast.Call) -> str | None:
"""Determine if this call creates a metric and return its type."""
metric_type_map = {
"_gauge_cls": "gauge",
"_counter_cls": "counter",
"_histogram_cls": "histogram",
}
if isinstance(node.func, ast.Attribute):
return metric_type_map.get(node.func.attr)
return None
def _extract_kwarg(self, node: ast.Call, key: str) -> str | None:
"""Extract a keyword argument value from a function call."""
for keyword in node.keywords:
if keyword.arg == key:
return self._get_string_value(keyword.value)
return None
def _get_string_value(self, node: ast.AST) -> str | None:
"""Extract string value from an AST node."""
if isinstance(node, ast.Constant):
return str(node.value) if node.value is not None else None
return None
def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]:
    """Parse a Python file and extract all metric definitions.

    Raises:
        RuntimeError: if the file cannot be read or parsed.
    """
    try:
        # read_text mirrors open(...).read() with explicit UTF-8 decoding.
        tree = ast.parse(
            filepath.read_text(encoding="utf-8"), filename=str(filepath)
        )
        visitor = MetricExtractor()
        visitor.visit(tree)
        return visitor.metrics
    except Exception as e:
        raise RuntimeError(f"Failed to parse {filepath}: {e}") from e
def generate_markdown_table(metrics: list[dict[str, str]]) -> str:
    """Generate a markdown table from extracted metrics.

    Args:
        metrics: Entries with "name", "type" and "documentation" keys.

    Returns:
        A markdown table (with trailing newline), or a placeholder
        message when no metrics were found.
    """
    if not metrics:
        return "No metrics found.\n"
    # Sort by type, then by name
    metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"]))
    lines = []
    lines.append("| Metric Name | Type | Description |")
    lines.append("|-------------|------|-------------|")
    for metric in metrics_sorted:
        name = metric["name"]
        metric_type = metric["type"].capitalize()
        # Collapse newlines and escape pipe characters so a description
        # cannot break the markdown table layout.
        doc = (
            metric["documentation"].replace("\n", " ").replace("|", "\\|").strip()
        )
        lines.append(f"| `{name}` | {metric_type} | {doc} |")
    return "\n".join(lines) + "\n"
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    """Generate metrics documentation tables from source files.

    MkDocs startup hook: for every entry in METRIC_SOURCE_FILES, extract
    the metric definitions and write a markdown table into
    GENERATED_METRICS_DIR.

    Raises:
        FileNotFoundError: if a configured source file does not exist.
    """
    logger.info("Generating metrics documentation")
    # Make sure the output directory exists before writing any tables.
    GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True)
    total_metrics = 0
    for entry in METRIC_SOURCE_FILES:
        src_rel, out_name = entry["path"], entry["output"]
        src_path = ROOT_DIR / src_rel
        if not src_path.exists():
            raise FileNotFoundError(f"Metrics source file not found: {src_path}")
        logger.debug("Extracting metrics from: %s", src_rel)
        found = extract_metrics_from_file(src_path)
        logger.debug("Found %d metrics in %s", len(found), src_rel)
        # Generate and write the markdown table for this source.
        out_path = GENERATED_METRICS_DIR / out_name
        out_path.write_text(generate_markdown_table(found), encoding="utf-8")
        total_metrics += len(found)
        logger.info(
            "Generated metrics table: %s (%d metrics)",
            out_path.relative_to(ROOT_DIR),
            len(found),
        )
    logger.info(
        "Total metrics generated: %d across %d files",
        total_metrics,
        len(METRIC_SOURCE_FILES),
    )

View File

@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server:
The following metrics are exposed:
??? code
## General Metrics
```python
--8<-- "vllm/engine/metrics.py:metrics-definitions"
```
--8<-- "docs/generated/metrics/general.md"
## Speculative Decoding Metrics
--8<-- "docs/generated/metrics/spec_decode.md"
## NIXL KV Connector Metrics
--8<-- "docs/generated/metrics/nixl_connector.md"
## Deprecation Policy
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,

View File

@ -51,6 +51,7 @@ hooks:
- docs/mkdocs/hooks/remove_announcement.py
- docs/mkdocs/hooks/generate_examples.py
- docs/mkdocs/hooks/generate_argparse.py
- docs/mkdocs/hooks/generate_metrics.py
- docs/mkdocs/hooks/url_schemes.py
plugins:

View File

@ -75,7 +75,7 @@ torchgeo==0.7.0
mteb==2.1.2
# Data processing
xgrammar==0.1.27
xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
# Test async scheduling
# Utilities

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
@ -14,7 +15,8 @@ from openai.types.responses.response_reasoning_item import (
)
from vllm.entrypoints.responses_utils import (
construct_chat_message_with_tool_call,
_construct_single_message_from_response_item,
construct_chat_messages_with_tool_call,
convert_tool_responses_to_completions_format,
)
@ -42,7 +44,43 @@ class TestResponsesUtils:
assert result == {"type": "function", "function": input_tool}
def test_construct_chat_message_with_tool_call(self):
def test_construct_chat_messages_with_tool_call(self):
"""Test construction of chat messages with tool calls."""
reasoning_item = ResponseReasoningItem(
id="lol",
summary=[],
type="reasoning",
content=[
Content(
text="Leroy Jenkins",
type="reasoning_text",
)
],
encrypted_content=None,
status=None,
)
mcp_tool_item = ResponseFunctionToolCall(
id="mcp_123",
call_id="call_123",
type="function_call",
status="completed",
name="python",
arguments='{"code": "123+456"}',
)
input_items = [reasoning_item, mcp_tool_item]
messages = construct_chat_messages_with_tool_call(input_items)
assert len(messages) == 1
message = messages[0]
assert message["role"] == "assistant"
assert message["reasoning"] == "Leroy Jenkins"
assert message["tool_calls"][0]["id"] == "call_123"
assert message["tool_calls"][0]["function"]["name"] == "python"
assert (
message["tool_calls"][0]["function"]["arguments"] == '{"code": "123+456"}'
)
def test_construct_single_message_from_response_item(self):
item = ResponseReasoningItem(
id="lol",
summary=[],
@ -56,7 +94,7 @@ class TestResponsesUtils:
encrypted_content=None,
status=None,
)
formatted_item = construct_chat_message_with_tool_call(item)
formatted_item = _construct_single_message_from_response_item(item)
assert formatted_item["role"] == "assistant"
assert formatted_item["reasoning"] == "Leroy Jenkins"
@ -74,7 +112,7 @@ class TestResponsesUtils:
status=None,
)
formatted_item = construct_chat_message_with_tool_call(item)
formatted_item = _construct_single_message_from_response_item(item)
assert formatted_item["role"] == "assistant"
assert (
formatted_item["reasoning"]
@ -88,7 +126,7 @@ class TestResponsesUtils:
output="1234",
status="completed",
)
formatted_item = construct_chat_message_with_tool_call(tool_call_output)
formatted_item = _construct_single_message_from_response_item(tool_call_output)
assert formatted_item["role"] == "tool"
assert formatted_item["content"] == "1234"
assert formatted_item["tool_call_id"] == "temp"
@ -102,7 +140,7 @@ class TestResponsesUtils:
status=None,
)
with pytest.raises(ValueError):
construct_chat_message_with_tool_call(item)
_construct_single_message_from_response_item(item)
output_item = ResponseOutputMessage(
id="msg_bf585bbbe3d500e0",
@ -119,6 +157,6 @@ class TestResponsesUtils:
type="message",
)
formatted_item = construct_chat_message_with_tool_call(output_item)
formatted_item = _construct_single_message_from_response_item(output_item)
assert formatted_item["role"] == "assistant"
assert formatted_item["content"] == "dongyi"

View File

@ -7,7 +7,8 @@ import math
import pytest
import torch
from vllm.platforms import current_platform
from vllm.platforms import CpuArchEnum, current_platform
from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
if not current_platform.is_cpu():
pytest.skip("skipping CPU-only tests", allow_module_level=True)
@ -36,6 +37,21 @@ SEQ_LENS = [ # (q_len, kv_len)
]
def get_attn_isa(
    block_size: int | None = None,
    dtype: torch.dtype | None = None,
):
    """Pick the attention ISA string for the current CPU.

    When both block_size and dtype are given, defers to the backend's own
    _get_attn_isa selection; otherwise falls back to a platform check:
    "neon" on Arm, "amx" when AMX tiles are supported, "vec" otherwise.
    """
    # NOTE(review): `block_size and dtype` treats block_size == 0 as unset;
    # presumably block sizes are always positive - confirm with callers.
    if block_size and dtype:
        return _get_attn_isa(dtype, block_size)
    else:
        if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
            return "neon"
        elif torch._C._cpu._is_amx_tile_supported():
            return "amx"
        else:
            return "vec"
# rand number generation takes too much time, cache rand tensors
@functools.lru_cache(maxsize=128, typed=False)
def tensor_cache(
@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16(
)
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", [96, 128])
@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
@pytest.mark.parametrize("dtype", QTYPES)
@pytest.mark.parametrize("soft_cap", [None])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [False])
@pytest.mark.parametrize("use_sink", [False])
@pytest.mark.parametrize("isa", ["neon"])
@pytest.mark.skipif(
    current_platform.get_cpu_architecture() != CpuArchEnum.ARM,
    reason="Not an Arm CPU.",
)
def test_varlen_with_paged_kv_normal_neon(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],
    head_size: int,
    sliding_window: int | None,
    dtype: torch.dtype,
    block_size: int,
    soft_cap: float | None,
    num_blocks: int,
    use_alibi: bool,
    use_sink: bool,
    isa: str,
) -> None:
    """Varlen paged-KV attention with the "neon" ISA (Arm CPUs only)."""
    # Delegates to the shared driver; only the isa/block_size combinations
    # differ from the other test_varlen_with_paged_kv_* variants.
    varlen_with_paged_kv(
        seq_lens=seq_lens,
        num_heads=num_heads,
        head_size=head_size,
        sliding_window=sliding_window,
        dtype=dtype,
        block_size=block_size,
        soft_cap=soft_cap,
        num_blocks=num_blocks,
        use_alibi=use_alibi,
        use_sink=use_sink,
        isa=isa,
    )
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", [96])
@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [False])
@pytest.mark.parametrize("use_sink", [False])
@pytest.mark.parametrize(
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
)
@pytest.mark.parametrize("isa", [get_attn_isa()])
def test_varlen_with_paged_kv_softcap(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [True])
@pytest.mark.parametrize("use_sink", [False])
@pytest.mark.parametrize(
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
)
@pytest.mark.parametrize("isa", [get_attn_isa()])
def test_varlen_with_paged_kv_alibi(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [False])
@pytest.mark.parametrize("use_sink", [True])
@pytest.mark.parametrize(
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
)
@pytest.mark.parametrize("isa", [get_attn_isa()])
def test_varlen_with_paged_kv_sink(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],

View File

@ -54,6 +54,10 @@ def setup_cuda():
torch.set_default_device("cuda")
@pytest.mark.skipif(
current_platform.is_fp8_fnuz(),
reason="This platform supports e4m3fnuz, not e4m3fn.",
)
@pytest.mark.parametrize(
"num_tokens,d,dtype,group_size,seed",
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS),
@ -78,14 +82,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
torch.manual_seed(seed)
factor_for_scale = 1e-2
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_info = torch.finfo(current_platform.fp8_dtype())
fp8_max, fp8_min = fp8_info.max, fp8_info.min
A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
@ -103,6 +107,9 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
assert rel_diff < 0.001
@pytest.mark.skipif(
not current_platform.is_cuda(), reason="CUTLASS only supported on CUDA platform."
)
@torch.inference_mode()
def test_w8a8_block_fp8_cutlass_matmul():
# Test simple case where weight.shape % 128 != 0,
@ -151,6 +158,10 @@ def test_w8a8_block_fp8_cutlass_matmul():
assert rel_diff < 0.001
@pytest.mark.skipif(
current_platform.is_fp8_fnuz(),
reason="This platform supports e4m3fnuz, not e4m3fn.",
)
@pytest.mark.parametrize(
"M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),

View File

@ -15,6 +15,9 @@ from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv
if not current_platform.is_cuda():
pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
MNK_FACTORS = [
(1, 256, 128),
(1, 16384, 1024),

View File

@ -21,6 +21,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
if not current_platform.is_cuda():
pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
# unit tests to a common utility function. Currently the use of
# `is_quant_method_supported` conflates kernels with quantization methods

View File

@ -92,16 +92,19 @@ def run_test(
*,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
dtype: str = "half",
) -> None:
prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10
with vllm_runner(
model,
dtype="half",
dtype=dtype,
max_model_len=448,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
# TODO (NickLucche) figure out output differences with non-eager and re-enable
enforce_eager=True,
) as vllm_model:
llm = vllm_model.llm
@ -120,12 +123,28 @@ def run_test(
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
def test_models(vllm_runner, model) -> None:
def test_models(vllm_runner, model, dtype) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=1,
dtype=dtype,
)
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
def test_models_cpu(vllm_runner, model, dtype) -> None:
    """Run the Whisper generation check on CPU via the shared run_test."""
    # @create_new_process_for_each_test() does not work for some runners
    # TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
    run_test(
        vllm_runner,
        model,
        tensor_parallel_size=1,
        dtype=dtype,
    )

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import base64
import mimetypes
import os
@ -186,6 +187,7 @@ async def test_fetch_image_error_conversion():
connector.fetch_image(broken_img)
@pytest.mark.flaky(reruns=3, reruns_delay=5)
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
@ -198,8 +200,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
}
)
video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(video_url)
try:
video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(video_url)
except (TimeoutError, asyncio.TimeoutError) as e:
pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")
assert np.array_equal(video_sync, video_async)
assert metadata_sync == metadata_async

View File

@ -106,8 +106,8 @@ def create_common_attn_metadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=batch_spec.batch_size,
num_actual_tokens=num_tokens,
max_query_len=max_query_len,

View File

@ -8,6 +8,7 @@ import torch._dynamo.config as dynamo_config
from vllm import SamplingParams
from vllm.logprobs import Logprob
from vllm.platforms import current_platform
from vllm.sampling_params import StructuredOutputsParams
from vllm.v1.metrics.reader import Metric
@ -70,6 +71,18 @@ def test_without_spec_decoding(
(True, "uni", True, None, True),
]
if current_platform.is_rocm():
# On ROCm, Only test with structured_outputs (deterministic)
# and skip chunk_prefill (more variable).
test_configs = [
cfg
for cfg in test_configs
if not cfg[4] # skip chunk_prefill=True
]
test_sampling_params = [
p for p in test_sampling_params if p.get("structured_outputs") is not None
]
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
@ -108,7 +121,14 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
(True, "uni", True, spec_config_short, True),
]
run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params)
# On ROCm, use TRITON_ATTN + float32 for better numerical consistency
run_tests(
monkeypatch,
MTP_MODEL,
test_configs,
test_sampling_params,
is_testing_with_spec_decoding=True,
)
@dynamo_config.patch(cache_size_limit=16)
@ -117,13 +137,21 @@ def run_tests(
model: str,
test_configs: list[tuple],
test_sampling_params: list[dict[str, Any]],
is_testing_with_spec_decoding: bool = False,
):
"""Test consistency of combos of async scheduling, preemption,
uni/multiproc executor with spec decoding."""
with monkeypatch.context() as m:
# avoid precision errors
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
if current_platform.is_rocm():
if is_testing_with_spec_decoding:
# Use TRITON_ATTN for spec decoding test for consistency
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
else:
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
else:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
# lock matmul precision to full FP32
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
# m.setenv("VLLM_BATCH_INVARIANT", "1")
@ -145,6 +173,7 @@ def run_tests(
async_scheduling,
spec_config,
test_prefill_chunking=test_prefill_chunking,
is_testing_with_spec_decoding=is_testing_with_spec_decoding,
)
outputs.append(test_results)
@ -174,17 +203,34 @@ def run_tests(
name_0=f"baseline=[{baseline_config}], params={params}",
name_1=f"config=[{test_config}], params={params}",
)
assert _all_logprobs_match(base_logprobs, test_logprobs)
# On ROCm with TRITON_ATTN (spec decoding test), skip strict
# logprobs comparison when logprobs are requested
skip_logprobs_check = (
current_platform.is_rocm()
and params.get("logprobs")
and is_testing_with_spec_decoding
)
if not skip_logprobs_check:
assert _all_logprobs_match(base_logprobs, test_logprobs)
if (
base_acceptance_rate is not None
and test_acceptance_rate is not None
):
if "spec_mml=None" in test_config:
# Preemption causes more variance in acceptance rates
if (
current_platform.is_rocm()
and "preemption=True" in test_config
):
tolerance = 0.10
else:
tolerance = 0.05
assert (
test_acceptance_rate > base_acceptance_rate
or test_acceptance_rate
== pytest.approx(base_acceptance_rate, rel=5e-2)
== pytest.approx(base_acceptance_rate, rel=tolerance)
)
else:
# Currently the reported acceptance rate is expected to be
@ -215,6 +261,7 @@ def run_test(
async_scheduling: bool,
spec_config: dict[str, Any] | None,
test_prefill_chunking: bool,
is_testing_with_spec_decoding: bool = False,
):
spec_decoding = spec_config is not None
cache_arg: dict[str, Any] = (
@ -233,6 +280,15 @@ def run_test(
print("-" * 80)
print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80)
# On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for
# spec decoding test (TRITON_ATTN) for better precision.
# On others: always use float32.
if current_platform.is_rocm() and not is_testing_with_spec_decoding:
dtype = "float16"
else:
dtype = "float32"
with VllmRunner(
model,
max_model_len=512,
@ -242,7 +298,7 @@ def run_test(
# enforce_eager=True,
async_scheduling=async_scheduling,
distributed_executor_backend=executor,
dtype="float32", # avoid precision errors
dtype=dtype,
speculative_config=spec_config,
disable_log_stats=False,
**cache_arg,
@ -302,11 +358,21 @@ def _all_logprobs_match(req_a, req_b) -> bool:
def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
return len(lps_a) == len(lps_b) and all(
a.decoded_token == b.decoded_token
and a.rank == b.rank
and a.logprob == pytest.approx(b.logprob, rel=1e-3, abs=1e-6)
for a, b in ((lps_a[x], lps_b[x]) for x in lps_a)
if current_platform.is_rocm():
# ROCm has higher numerical variance
# due to use of float16.
rel_tol, abs_tol = 5e-2, 1e-5
else:
rel_tol, abs_tol = 1e-3, 1e-6
return (
len(lps_a) == len(lps_b)
and lps_a.keys() == lps_b.keys()
and all(
a.decoded_token == b.decoded_token
and a.rank == b.rank
and a.logprob == pytest.approx(b.logprob, rel=rel_tol, abs=abs_tol)
for a, b in ((lps_a[x], lps_b[x]) for x in lps_a)
)
)

View File

@ -0,0 +1,131 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test that verifies no implicit GPU-CPU synchronization occurs during
speculative decoding generation under expected conditions.
"""
import multiprocessing
import sys
import traceback
import pytest
import torch
@pytest.fixture
def sync_tracker():
    """
    Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
    lazy init syncs. Prints stack traces immediately when syncs occur.

    Yields a SyncTracker handle exposing the running sync count and an
    ``assert_no_sync()`` helper; the original property is restored on
    teardown.
    """
    from vllm.v1.attention.backends.utils import CommonAttentionMetadata

    # Shared counter for cross-process communication (inherited by fork)
    sync_count = multiprocessing.Value("i", 0)

    # Save original property so the wrapper can delegate to the real getter
    # and so it can be restored on teardown.
    original_prop = CommonAttentionMetadata.seq_lens_cpu
    original_fget = original_prop.fget

    # Create tracking wrapper: only the lazy-init path counts as a sync,
    # i.e. when the CPU copy has not been materialized yet
    # (self._seq_lens_cpu is None).
    def tracking_seq_lens_cpu(self):
        if self._seq_lens_cpu is None:
            # Increment counter under the Value's lock (shared with any
            # forked subprocess).
            with sync_count.get_lock():
                sync_count.value += 1
                count = sync_count.value
            # Print stack trace immediately (shows in subprocess output)
            print(f"\n{'=' * 60}", file=sys.stderr)
            print(f"SYNC #{count}: seq_lens_cpu lazy init triggered!", file=sys.stderr)
            print(f"{'=' * 60}", file=sys.stderr)
            traceback.print_stack(file=sys.stderr)
            print(f"{'=' * 60}\n", file=sys.stderr)
            sys.stderr.flush()
        return original_fget(self)

    # Apply patch at the class level so every instance is tracked.
    CommonAttentionMetadata.seq_lens_cpu = property(tracking_seq_lens_cpu)

    class SyncTracker:
        # Handle yielded to the test for inspecting the shared counter.

        @property
        def count(self) -> int:
            # Number of lazy-init syncs observed so far.
            return sync_count.value

        def assert_no_sync(self, msg: str = ""):
            # Fail if any lazy init fired; the stack traces printed by the
            # wrapper above identify the offending call sites.
            count = sync_count.value
            assert count == 0, (
                f"Unexpected GPU-CPU sync: seq_lens_cpu lazy init triggered "
                f"{count} times. See stack traces above. {msg}"
            )

    yield SyncTracker()

    # Restore original property
    CommonAttentionMetadata.seq_lens_cpu = original_prop
    # Reset dynamo state — presumably so compiled artifacts that captured
    # the patched property do not leak into later tests; TODO confirm.
    torch._dynamo.reset()
# Test configurations: (model, spec_model, method, num_spec_tokens)
# Each entry pairs a target model with its draft (speculator) model.
SPEC_DECODE_CONFIGS = [
    pytest.param(
        "meta-llama/Llama-3.2-1B-Instruct",
        "nm-testing/Llama3_2_1B_speculator.eagle3",
        "eagle3",
        2,
        id="eagle3-llama",
    ),
    # Per its id, this case covers the MLA (DeepSeek) attention path using
    # small random-weight models.
    pytest.param(
        "eagle618/deepseek-v3-random",
        "eagle618/eagle-deepseek-v3-random",
        "eagle",
        2,
        id="eagle-mla-deepseek",
    ),
]
@pytest.mark.parametrize(
    "model,spec_model,method,num_spec_tokens",
    SPEC_DECODE_CONFIGS,
)
def test_no_sync_with_spec_decode(
    sync_tracker,
    model: str,
    spec_model: str,
    method: str,
    num_spec_tokens: int,
):
    """
    Test that no implicit GPU-CPU sync occurs during speculative decoding
    generation.

    Args:
        sync_tracker: fixture counting seq_lens_cpu lazy-init syncs.
        model: target model to serve.
        spec_model: draft (speculator) model.
        method: speculative decoding method (e.g. "eagle", "eagle3").
        num_spec_tokens: number of speculative tokens per step.
    """
    # Import vLLM AFTER sync_tracker fixture has applied the patch
    from vllm import LLM, SamplingParams
    from vllm.distributed import cleanup_dist_env_and_memory

    llm = LLM(
        model=model,
        max_model_len=256,
        speculative_config={
            "method": method,
            "num_speculative_tokens": num_spec_tokens,
            "model": spec_model,
        },
        enforce_eager=True,
        # async_scheduling is the mode an implicit H<>D sync would break.
        async_scheduling=True,
    )
    # Greedy decoding keeps the run deterministic; a short generation is
    # enough to exercise the spec-decode path.
    outputs = llm.generate(
        ["Hello, my name is"],
        SamplingParams(temperature=0, max_tokens=10),
    )
    # Sanity-check that generation actually produced text.
    assert len(outputs) == 1
    assert len(outputs[0].outputs[0].text) > 0

    # Tear down the engine and free GPU memory before the final check.
    del llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()

    # Core assertion: no seq_lens_cpu lazy init was ever triggered.
    sync_tracker.assert_no_sync()

View File

@ -191,8 +191,8 @@ def test_suffix_decoding_acceptance(
# Expect the acceptance rate to improve.
assert first_accept_rate < last_accept_rate
# Heuristic: expect at least 82.5% acceptance rate at the end.
assert last_accept_rate > 0.825
# Heuristic: expect at least 80.0% acceptance rate at the end.
assert last_accept_rate > 0.80
del spec_llm
torch.cuda.empty_cache()

View File

@ -88,8 +88,8 @@ def forward_attention(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc.cpu(),
seq_lens=seq_lens,
seq_lens_cpu=seq_lens.cpu(),
num_computed_tokens_cpu=context_lens.cpu(),
_seq_lens_cpu=seq_lens.cpu(),
_num_computed_tokens_cpu=context_lens.cpu(),
num_reqs=batch_size,
num_actual_tokens=num_actual_tokens,
max_query_len=max_query_len,

View File

@ -103,7 +103,7 @@ def create_cross_attention_backend(
# needed here to know how many tokens to attend to from the cached
# cross-attention KV cache.
new_metadata.seq_lens = common_attn_metadata.encoder_seq_lens
new_metadata.seq_lens_cpu = torch.from_numpy(
new_metadata._seq_lens_cpu = torch.from_numpy(
common_attn_metadata.encoder_seq_lens_cpu
)

View File

@ -788,7 +788,7 @@ async def benchmark(
)
print(
"{:<40} {:<10.2f}".format(
"Total Token throughput (tok/s):", metrics.total_token_throughput
"Total token throughput (tok/s):", metrics.total_token_throughput
)
)

View File

@ -667,8 +667,9 @@ class VllmConfig:
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
self._apply_optimization_level_defaults(default_config)
if (
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
):
logger.info(
@ -693,22 +694,29 @@ class VllmConfig:
if current_platform.support_static_graph_mode():
# if cudagraph_mode has full cudagraphs, we need to check support
if (
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
and self.model_config is not None
):
if self.model_config.pooler_config is not None:
if model_config := self.model_config:
if (
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
and model_config.pooler_config is not None
):
logger.warning_once(
"Pooling models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
elif self.model_config.is_encoder_decoder:
logger.warning_once(
"Encoder-decoder models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
elif (
model_config.is_encoder_decoder
and self.compilation_config.cudagraph_mode
not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
):
logger.info_once(
"Encoder-decoder models do not support %s. "
"Overriding cudagraph_mode to FULL_DECODE_ONLY.",
self.compilation_config.cudagraph_mode.name,
)
self.compilation_config.cudagraph_mode = (
CUDAGraphMode.FULL_DECODE_ONLY
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
# disable cudagraph when enforce eager execution
if self.model_config is not None and self.model_config.enforce_eager:
@ -1048,8 +1056,14 @@ class VllmConfig:
self.compilation_config.max_cudagraph_capture_size
)
if max_cudagraph_capture_size is None:
decode_query_len = 1
if (
self.speculative_config
and self.speculative_config.num_speculative_tokens
):
decode_query_len += self.speculative_config.num_speculative_tokens
max_cudagraph_capture_size = min(
self.scheduler_config.max_num_seqs * 2, 512
self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
)
max_num_tokens = self.scheduler_config.max_num_batched_tokens
max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)

View File

@ -8,3 +8,5 @@ Shared constants for vLLM entrypoints.
# These constants help mitigate header abuse attacks
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB
H11_MAX_HEADER_COUNT_DEFAULT = 256
MCP_PREFIX = "mcp_"

View File

@ -19,6 +19,7 @@ from vllm import envs
from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption,
)
from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.parser.harmony_utils import (
get_encoding,
get_streamable_parser_for_assistant,
@ -303,7 +304,7 @@ class ParsableContext(ConversationContext):
result_str = result.content[0].text
message = ResponseFunctionToolCallOutputItem(
id=f"fco_{random_uuid()}",
id=f"mcpo_{random_uuid()}",
type="function_call_output",
call_id=f"call_{random_uuid()}",
output=result_str,
@ -385,6 +386,9 @@ class ParsableContext(ConversationContext):
if not self.parser.response_messages:
return []
last_msg = self.parser.response_messages[-1]
# change this to a mcp_ function call
last_msg.id = f"{MCP_PREFIX}{random_uuid()}"
self.parser.response_messages[-1] = last_msg
if last_msg.name == "code_interpreter":
return await self.call_python_tool(self._tool_sessions["python"], last_msg)
elif last_msg.name == "web_search_preview":

View File

@ -1339,6 +1339,7 @@ class OpenAIServing:
)
engine_prompt = engine_prompts[0]
request_prompt = request_prompts[0]
prompt_text, _, _ = self._get_prompt_components(request_prompt)
# Update the sampling params.
sampling_params.max_tokens = self.max_model_len - len(

View File

@ -22,6 +22,7 @@ from openai.types.responses.response_reasoning_item import ResponseReasoningItem
from openai.types.responses.tool import Tool
from vllm import envs
from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.protocol import (
ChatCompletionMessageParam,
ResponseInputOutputItem,
@ -44,13 +45,13 @@ def make_response_output_items_from_parsable_context(
)
if isinstance(output_messages[-1], ResponseFunctionToolCall):
mcp_message = McpCall(
id=f"mcp_{random_uuid()}",
id=f"{MCP_PREFIX}{random_uuid()}",
arguments=output_messages[-1].arguments,
name=output_messages[-1].name,
server_label=output_messages[
-1
].name, # TODO: store the server label
type="mcp_call",
type=f"{MCP_PREFIX}call",
status="completed",
output=message.output,
# TODO: support error output
@ -98,12 +99,63 @@ def construct_input_messages(
if isinstance(request_input, str):
messages.append({"role": "user", "content": request_input})
else:
for item in request_input:
messages.append(construct_chat_message_with_tool_call(item))
input_messages = construct_chat_messages_with_tool_call(request_input)
messages.extend(input_messages)
return messages
def construct_chat_message_with_tool_call(
def _maybe_combine_reasoning_and_tool_call(
    item: ResponseInputOutputItem, messages: list[ChatCompletionMessageParam]
) -> ChatCompletionMessageParam | None:
    """Fold an MCP tool call into the preceding reasoning message.

    Many models treat MCP calls and reasoning as a single message. When
    ``item`` is an MCP-prefixed function tool call and the last entry in
    ``messages`` is an assistant message carrying reasoning, attach the
    tool call to that entry and return it; otherwise return ``None`` so
    the caller turns ``item`` into its own message.
    """
    is_mcp_call = isinstance(item, ResponseFunctionToolCall) and item.id.startswith(
        MCP_PREFIX
    )
    if not is_mcp_call or not messages:
        return None

    tail = messages[-1]
    tail_has_reasoning = (
        tail.get("role") == "assistant" and tail.get("reasoning") is not None
    )
    if not tail_has_reasoning:
        return None

    tool_call = ChatCompletionMessageToolCallParam(
        id=item.call_id,
        function=FunctionCallTool(
            name=item.name,
            arguments=item.arguments,
        ),
        type="function",
    )
    tail["tool_calls"] = [tool_call]
    return tail
def construct_chat_messages_with_tool_call(
    input_messages: list[ResponseInputOutputItem],
) -> list[ChatCompletionMessageParam]:
    """Convert response items into chat messages, merging where needed.

    Wraps ``_construct_single_message_from_response_item`` because some
    chat messages are built from multiple response items — for example, a
    reasoning item followed by an MCP tool call is two response items but
    one chat message.
    """
    chat_messages: list[ChatCompletionMessageParam] = []
    for response_item in input_messages:
        combined = _maybe_combine_reasoning_and_tool_call(
            response_item, chat_messages
        )
        if combined is None:
            chat_messages.append(
                _construct_single_message_from_response_item(response_item)
            )
        else:
            chat_messages[-1] = combined
    return chat_messages
def _construct_single_message_from_response_item(
item: ResponseInputOutputItem,
) -> ChatCompletionMessageParam:
if isinstance(item, ResponseFunctionToolCall):

View File

@ -881,7 +881,7 @@ class FusedMoE(CustomOp):
# Record that the clone will be used by shared_experts_stream
# to avoid gc issue from deallocation of hidden_states_clone
# For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
# NOTE: We dont need shared_output.record_stream(current_stream())
# NOTE: We don't need shared_output.record_stream(current_stream())
# because we synch the streams before using shared_output.
hidden_states_clone.record_stream(self.shared_experts_stream)

View File

@ -28,7 +28,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
@classmethod
def get_min_capability(cls) -> int:
# dont restrict as emulations
# don't restrict as emulations
return 80
def create_weights(

View File

@ -31,7 +31,6 @@ from vllm.model_executor.utils import replace_parameter
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.deep_gemm import (
DeepGemmQuantScaleFMT,
fp8_gemm_nt,
is_deep_gemm_e8m0_used,
is_deep_gemm_supported,
@ -248,6 +247,7 @@ class W8A8BlockFp8LinearOp:
self.act_quant_group_shape = act_quant_group_shape
self.is_deep_gemm_supported = is_deep_gemm_supported()
self.is_hopper = current_platform.is_device_capability(90)
self.is_blackwell = current_platform.is_device_capability(100)
self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
# Get the correct blockscale mul and input quant operations.
@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp:
weight: torch.Tensor,
weight_scale: torch.Tensor,
) -> torch.Tensor:
if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0:
if self.use_deep_gemm_e8m0 and self.is_blackwell:
q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
input_2d,
group_size=self.act_quant_group_shape.col,

View File

@ -502,6 +502,7 @@ class HunYuanVisionTransformer(nn.Module):
cu_seqlens: list = [0]
hidden_states = x.to(device=self.device, dtype=self.dtype)
# embeddings = patch_embeds + patch_pos_embed
hidden_states = self.embeddings(hidden_states, grid_thw)
for t, h, w in grid_thw:
@ -515,8 +516,14 @@ class HunYuanVisionTransformer(nn.Module):
hidden_states = hidden_states.reshape(seq_len, -1)
hidden_states = hidden_states.unsqueeze(0)
for layer_num, layer in enumerate(self.layers):
hidden_states = layer(hidden_states)
# build per-image lengths once
split_lengths = [int(h) * int(w) for (_, h, w) in grid_thw]
for layer in self.layers:
# hidden_states: (1, T_total, D)
parts = hidden_states.split(split_lengths, dim=1) # list of (1, L_i, D)
parts = [layer(p) for p in parts]
hidden_states = torch.cat(parts, dim=1)
# adapter
split_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()

View File

@ -201,7 +201,7 @@ class MiniMaxM2Attention(nn.Module):
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=rotary_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
rope_parameters=rope_parameters,
)

View File

@ -403,6 +403,7 @@ class Qwen3MoeModel(nn.Module):
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.config = config
self.quant_config = quant_config
self.embed_tokens = VocabParallelEmbedding(
config.vocab_size,
config.hidden_size,
@ -505,6 +506,19 @@ class Qwen3MoeModel(nn.Module):
loaded_params: set[str] = set()
expert_params_mapping = self.get_expert_mapping()
for name, loaded_weight in weights:
if self.quant_config is not None and (
scale_name := self.quant_config.get_cache_scale(name)
):
# Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
assert loaded_weight.numel() == 1, (
f"KV scale numel {loaded_weight.numel()} != 1"
)
loaded_weight = loaded_weight.squeeze()
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
for param_name, weight_name, shard_id in stacked_params_mapping:
# Skip non-stacked layers and experts (experts handled below).
if weight_name not in name:

View File

@ -89,7 +89,7 @@ def _extract_data_from_linear_base_module(
assert m.quant_method.quant_config is not None
w = m.weight
ws = m.weight_scale
ws = m.weight_scale_inv if hasattr(m, "weight_scale_inv") else m.weight_scale
quant_block_size = m.quant_method.quant_config.weight_block_size
assert isinstance(w, torch.Tensor)

View File

@ -954,7 +954,7 @@ MultiModalKwargsOptionalItems: TypeAlias = (
)
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.")
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.14.")
class MultiModalKwargs(UserDict[str, NestedTensors]):
"""
A dictionary that represents the keyword arguments to
@ -964,7 +964,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
@staticmethod
@deprecated(
"`MultiModalKwargs.from_hf_inputs` is deprecated and "
"will be removed in v0.13. "
"will be removed in v0.14. "
"Please use `MultiModalKwargsItems.from_hf_inputs` and "
"access the tensor data using `.get_data()`."
)
@ -977,7 +977,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
@staticmethod
@deprecated(
"`MultiModalKwargs.from_items` is deprecated and "
"will be removed in v0.13. "
"will be removed in v0.14. "
"Please use `MultiModalKwargsItems.from_seq` and "
"access the tensor data using `.get_data()`."
)

View File

@ -429,12 +429,12 @@ def group_mm_kwargs_by_modality(
if merge_by_field_config is not None:
logger.warning_once(
"The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
"is deprecated and will be removed in v0.13."
"is deprecated and will be removed in v0.14."
)
if multimodal_cpu_fields is not None:
logger.warning_once(
"The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` "
"is deprecated and will be removed in v0.13."
"is deprecated and will be removed in v0.14."
)
from vllm.multimodal.inputs import MultiModalKwargsItems

View File

@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer):
self.name_or_path = (
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
)
self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)
@classmethod
def from_pretrained(
@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
def __len__(self) -> int:
# </think> is an added token in DeepseekV32 tokenizer
return self.vocab_size + len(self.get_added_vocab())
return self.vocab_size + self._added_vocab_size
def __call__(
self,
@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
return self.tokenizer.get_vocab()
def get_added_vocab(self) -> dict[str, int]:
return self.tokenizer.get_added_vocab()
return self._added_vocab.copy()
def encode(
self,

View File

@ -17,7 +17,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
"`vllm.tokenizers.TokenizerLike`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -29,7 +29,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.get_tokenizer` "
"has been moved to `vllm.tokenizers.get_tokenizer`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -41,7 +41,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
"has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -53,7 +53,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -65,7 +65,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
"has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -75,7 +75,7 @@ def __getattr__(name: str):
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.")
@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.")
def decode_tokens(
tokenizer: TokenizerLike,
token_ids: list[int],
@ -97,7 +97,7 @@ def decode_tokens(
return tokenizer.decode(token_ids, **kw_args)
@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.")
@deprecated("Will be removed in v0.14. Please use `tokenizer.encode()` instead.")
def encode_tokens(
tokenizer: TokenizerLike,
text: str,

View File

@ -11,7 +11,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been "
"moved to `vllm.tokenizers.TokenizerLike`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -23,7 +23,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been "
"moved to `vllm.tokenizers.TokenizerRegistry`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)

View File

@ -21,7 +21,7 @@ from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
split_decodes_and_prefills,
)
from vllm.v1.kv_cache_interface import AttentionSpec
from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec
logger = init_logger(__name__)
@ -50,11 +50,13 @@ class CPUAttentionBackend(AttentionBackend):
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
"""CPU attention supports decoder and encoder-only attention."""
"""CPU attention supports decoder,
encoder-only and encoder-decoder attention."""
return attn_type in (
AttentionType.DECODER,
AttentionType.ENCODER,
AttentionType.ENCODER_ONLY,
AttentionType.ENCODER_DECODER,
)
@staticmethod
@ -136,6 +138,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
self.window_size = -1
self.block_size = vllm_config.cache_config.block_size
self.isa = _get_attn_isa(self.dtype, self.block_size)
self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec)
def build(
self,
@ -151,7 +154,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
seq_lens = common_attn_metadata.seq_lens
block_table_tensor = common_attn_metadata.block_table_tensor
slot_mapping = common_attn_metadata.slot_mapping
causal = common_attn_metadata.causal
causal = False if self.is_cross_attention else common_attn_metadata.causal
sdpa_start_loc = query_start_loc
num_decode_tokens = 0
@ -171,22 +174,19 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
query_start_loc = query_start_loc[: num_decodes + 1]
block_table_tensor = block_table_tensor[:num_decodes]
sheduler_metadata = None
if causal:
# for decode batch, use the custom kernel
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
num_reqs=num_reqs,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
head_dim=self.head_dim,
seq_lens=seq_lens,
dtype=self.dtype,
query_start_loc=query_start_loc,
causal=causal,
sliding_window_size=self.window_size,
isa=self.isa,
enable_kv_split=True,
)
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
num_reqs=num_reqs,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
head_dim=self.head_dim,
seq_lens=seq_lens,
dtype=self.dtype,
query_start_loc=query_start_loc,
causal=causal,
sliding_window_size=self.window_size,
isa=self.isa,
enable_kv_split=True,
)
attn_metadata = CPUAttentionMetadata(
isa=self.isa,

View File

@ -370,6 +370,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
num_accepted_tokens = torch.diff(m.query_start_loc)
num_decode_draft_tokens_cpu = (num_accepted_tokens - 1).cpu()
m.num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
m._num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
return self.build(0, m, num_accepted_tokens, num_decode_draft_tokens_cpu)

View File

@ -18,7 +18,7 @@ from typing import (
import numpy as np
import torch
from typing_extensions import runtime_checkable
from typing_extensions import deprecated, runtime_checkable
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.utils.math_utils import cdiv
@ -66,11 +66,6 @@ class CommonAttentionMetadata:
"""(batch_size + 1,), the start location of each request in query Tensor"""
seq_lens: torch.Tensor
seq_lens_cpu: torch.Tensor
"""(batch_size,), the length of each request including both computed tokens
and newly scheduled tokens"""
num_computed_tokens_cpu: torch.Tensor
"""(batch_size,), the number of computed tokens for each request"""
num_reqs: int
@ -81,7 +76,7 @@ class CommonAttentionMetadata:
max_query_len: int
"""Longest query in batch"""
max_seq_len: int
"""Longest context length in batch"""
"""Longest context length (may be an upper bound)"""
block_table_tensor: torch.Tensor
slot_mapping: torch.Tensor
@ -100,6 +95,40 @@ class CommonAttentionMetadata:
dcp_local_seq_lens_cpu: torch.Tensor | None = None
"""Sequence lengths of the local rank in decode context parallelism world"""
# WARNING: Deprecated fields. Will be removed in a future release (v0.14.0)
_seq_lens_cpu: torch.Tensor | None = None
_num_computed_tokens_cpu: torch.Tensor | None = None
@property
@deprecated(
"""
Prefer using device seq_lens directly to avoid implicit H<>D sync.
If a CPU copy is needed, use `seq_lens.cpu()` instead.
Will be removed in a future release (v0.14.0)
"""
)
def seq_lens_cpu(self) -> torch.Tensor:
if self._seq_lens_cpu is None:
self._seq_lens_cpu = self.seq_lens.to("cpu")
return self._seq_lens_cpu
@property
@deprecated(
"""
Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full
async scheduling. If a CPU copy is needed, it can be derived from
query_start_loc_cpu and seq_lens.
Will be removed in a future release (v0.14.0)
"""
)
def num_computed_tokens_cpu(self) -> torch.Tensor:
if self._num_computed_tokens_cpu is None:
query_seq_lens = (
self.query_start_loc_cpu[1:] - self.query_start_loc_cpu[:-1]
)
self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens
return self._num_computed_tokens_cpu
# TODO(lucas): remove once we have FULL-CG spec-decode support
def unpadded(
self, num_actual_tokens: int, num_actual_reqs: int
@ -109,8 +138,12 @@ class CommonAttentionMetadata:
query_start_loc=self.query_start_loc[: num_actual_reqs + 1],
query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1],
seq_lens=self.seq_lens[:num_actual_reqs],
seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs],
num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs],
_seq_lens_cpu=self._seq_lens_cpu[:num_actual_reqs]
if self._seq_lens_cpu is not None
else None,
_num_computed_tokens_cpu=self._num_computed_tokens_cpu[:num_actual_reqs]
if self._num_computed_tokens_cpu is not None
else None,
num_reqs=num_actual_reqs,
num_actual_tokens=num_actual_tokens,
max_query_len=self.max_query_len,
@ -224,14 +257,14 @@ def _make_metadata_with_slice(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=num_requests,
num_actual_tokens=num_actual_tokens,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
)
@ -689,9 +722,7 @@ def make_local_attention_virtual_batches(
return CommonAttentionMetadata(
query_start_loc_cpu=query_start_loc_cpu,
query_start_loc=query_start_loc_cpu.to(device=device, non_blocking=True),
seq_lens_cpu=seq_lens_cpu,
seq_lens=seq_lens_cpu.to(device=device, non_blocking=True),
num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
num_reqs=len(seq_lens_cpu),
num_actual_tokens=common_attn_metadata.num_actual_tokens,
max_query_len=seqlens_q_local.max(),
@ -699,6 +730,8 @@ def make_local_attention_virtual_batches(
block_table_tensor=block_table_local,
slot_mapping=common_attn_metadata.slot_mapping,
causal=True,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
)
@ -719,7 +752,6 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
logits_indices = logits_indices_padded[:num_logits_indices]
num_reqs = common_attn_metadata.num_reqs
query_start_loc = common_attn_metadata.query_start_loc
seq_lens = common_attn_metadata.seq_lens
# Example inputs
# num_reqs: 3
# generation_indices: [14, 18, 19, 27]
@ -748,9 +780,7 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
common_attn_metadata = CommonAttentionMetadata(
query_start_loc=decode_query_start_loc,
query_start_loc_cpu=decode_query_start_loc.to("cpu", non_blocking=True),
seq_lens=seq_lens,
seq_lens_cpu=seq_lens.to("cpu", non_blocking=True),
num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
seq_lens=common_attn_metadata.seq_lens,
num_reqs=num_reqs,
num_actual_tokens=total_num_decode_tokens,
max_query_len=decode_max_query_len,
@ -758,6 +788,8 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
block_table_tensor=common_attn_metadata.block_table_tensor,
slot_mapping=common_attn_metadata.slot_mapping,
causal=True,
_seq_lens_cpu=common_attn_metadata._seq_lens_cpu,
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
)
return common_attn_metadata

View File

@ -192,7 +192,7 @@ class AsyncLLM(EngineClient):
@property
@deprecated(
"`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. "
"The old name will be removed in v0.13."
"The old name will be removed in v0.14."
)
def processor(self):
return self.input_processor

View File

@ -139,7 +139,7 @@ class LLMEngine:
@property
@deprecated(
"`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
"The old name will be removed in v0.13."
"The old name will be removed in v0.14."
)
def processor(self):
return self.input_processor

View File

@ -10,7 +10,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.v1.engine.processor.Processor` has been moved to "
"`vllm.v1.engine.input_processor.InputProcessor`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)

View File

@ -440,16 +440,16 @@ class EagleProposer:
# of main model.
# Increment the sequence lengths.
common_attn_metadata.seq_lens += 1
# This is an out-of-place operation to avoid modifying the original tensor.
common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu + 1
# For the requests that exceed the max model length, we set the
# sequence length to 1 to minimize their overheads in attention.
common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
common_attn_metadata.num_computed_tokens_cpu = (
common_attn_metadata.seq_lens_cpu - 1
)
# Also update the CPU-side shadow; NOTE: this is hacky and should be
# removed in when common_attn_metadata.seq_lens_cpu is deprecated.
if common_attn_metadata._seq_lens_cpu is not None:
common_attn_metadata._seq_lens_cpu += 1
if common_attn_metadata._num_computed_tokens_cpu is not None:
common_attn_metadata._num_computed_tokens_cpu += 1
# Compute the slot mapping.
if self.uses_mrope:
@ -656,8 +656,8 @@ class EagleProposer:
query_start_loc=common_attn_metadata.query_start_loc,
seq_lens=common_attn_metadata.seq_lens,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens_cpu=common_attn_metadata.seq_lens_cpu,
num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
_seq_lens_cpu=common_attn_metadata._seq_lens_cpu,
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
num_reqs=common_attn_metadata.num_reqs,
num_actual_tokens=total_num_tokens,
max_query_len=new_query_len_per_req.max().item(),
@ -932,8 +932,8 @@ class EagleProposer:
query_start_loc=new_query_start_loc_cpu.to(device, non_blocking=True),
seq_lens=new_seq_lens_cpu.to(device, non_blocking=True),
query_start_loc_cpu=new_query_start_loc_cpu,
seq_lens_cpu=new_seq_lens_cpu,
num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
_seq_lens_cpu=new_seq_lens_cpu,
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
num_reqs=common_attn_metadata.num_reqs,
num_actual_tokens=total_num_tokens,
max_query_len=new_query_len_per_req.max().item(),

View File

@ -38,16 +38,16 @@ class MedusaProposer:
self,
target_hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> list[list[int]]:
) -> torch.Tensor:
# Generate blocks and compute logits
blocks = self.model(target_hidden_states)
logits = self.model.compute_logits(blocks)
# Get draft tokens and transpose the result
# TODO(woosuk): OPTIMIZATION: Return GPU tensor without GPU-CPU
# synchronization.
draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits]
return [list(row) for row in zip(*draft_tokens)]
# Compute argmax for each Medusa head and stack into a single tensor
# Shape: [batch_size, num_heads]
draft_tokens = torch.stack([logit.argmax(dim=-1) for logit in logits], dim=1)
return draft_tokens
def load_model(self, target_model: nn.Module) -> None:
from vllm.compilation.backends import set_model_tag

View File

@ -10,7 +10,7 @@ import torch
import vllm.envs
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_types import (
StructuredOutputBackend,
@ -56,6 +56,27 @@ class XgrammarBackend(StructuredOutputBackend):
stop_token_ids=stop_token_ids,
add_prefix_space=True,
)
elif isinstance(self.tokenizer, DeepseekV32Tokenizer):
# copy from xgr.TokenizerInfo.from_huggingface()
# because we are using a custom tokenizer wrapper here.
vocab_dict = self.tokenizer.get_vocab()
tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1)
vocab_size = self.vocab_size or tokenizer_vocab_size
# maintain tokenizer's indexing
encoded_vocab = [""] * vocab_size
for token, idx in vocab_dict.items():
if idx < vocab_size:
encoded_vocab[idx] = token
stop_token_ids = [self.tokenizer.eos_token_id]
backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str()
metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
tokenizer_info = xgr.TokenizerInfo(
encoded_vocab=encoded_vocab,
vocab_type=metadata["vocab_type"],
vocab_size=vocab_size,
stop_token_ids=stop_token_ids,
add_prefix_space=metadata["add_prefix_space"],
)
else:
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
self.tokenizer,

View File

@ -168,9 +168,9 @@ def build_attn_metadata(
query_start_loc=query_start_loc_gpu,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
_seq_lens_cpu=seq_lens_cpu,
max_seq_len=max_seq_len,
num_computed_tokens_cpu=num_computed_tokens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=num_reqs,
num_actual_tokens=num_tokens,
max_query_len=max_query_len,

View File

@ -1267,6 +1267,8 @@ class GPUModelRunner(
if not isinstance(kv_cache_spec, CrossAttentionSpec):
return None, None
# Zero out buffer for padding requests that are not actually scheduled (CGs)
self.encoder_seq_lens.np[:num_reqs] = 0
# Build encoder_seq_lens array mapping request indices to
# encoder lengths for inputs scheduled in this batch
for req_id in num_scheduled_tokens:
@ -1626,8 +1628,8 @@ class GPUModelRunner(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
num_actual_tokens=num_tokens_padded,
num_reqs=num_reqs_padded,
max_query_len=max_query_len,
@ -2764,6 +2766,7 @@ class GPUModelRunner(
# be improved in model runner v2)
force_uniform_decode: bool | None = None,
force_has_lora: bool | None = None,
num_encoder_reqs: int = 0,
) -> tuple[
CUDAGraphMode,
BatchDescriptor,
@ -2780,6 +2783,11 @@ class GPUModelRunner(
if force_uniform_decode is None
else force_uniform_decode
)
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
# is present). Also, chunked-prefill is disabled, so batch are uniform.
has_encoder_output = (
self.model_config.is_encoder_decoder and num_encoder_reqs > 0
)
has_lora = (
len(self.input_batch.lora_id_to_lora_request) > 0
@ -2799,7 +2807,7 @@ class GPUModelRunner(
)
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded, use_cascade_attn
num_tokens_padded, use_cascade_attn or has_encoder_output
)
num_tokens_padded = batch_descriptor.num_tokens
@ -2997,6 +3005,7 @@ class GPUModelRunner(
num_scheduled_tokens_np=num_scheduled_tokens_np,
max_num_scheduled_tokens=max_num_scheduled_tokens,
use_cascade_attn=cascade_attn_prefix_lens is not None,
num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
)
logger.debug(
@ -4871,7 +4880,7 @@ class GPUModelRunner(
# we need to adjust the cudagraph sizes to be a multiple of the uniform
# decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
# temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
# Will be removed in the near future when we have seperate cudagraph capture
# Will be removed in the near future when we have separate cudagraph capture
# sizes for decode and mixed prefill-decode.
if (
cudagraph_mode.decode_mode() == CUDAGraphMode.FULL

View File

@ -10,7 +10,7 @@ import torch
import torch.nn as nn
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import (
ensure_model_parallel_initialized,
init_distributed_environment,
@ -207,7 +207,8 @@ class TPUWorker:
# one compiled bytecode. Having one FX graph/cached bytecode per
# compiled model is required for `support_torch_compile` decorator to
# skip dynamo guard.
self.model_runner.reset_dynamo_cache()
with set_current_vllm_config(self.vllm_config):
self.model_runner.reset_dynamo_cache()
# Get the maximum amount of memory used by the model weights and
# intermediate activations.

View File

@ -135,7 +135,7 @@ class AttentionGroup:
kv_cache_spec: KVCacheSpec
kv_cache_group_id: int
# When ubatching is enabled we will have a metadata builder for each ubatch
# so that if they use internal persistant buffers for cudagraphs, and they
# so that if they use internal persistent buffers for cudagraphs, and they
# won't have to worry about conflicting with the other ubatches.
metadata_builders: list[AttentionMetadataBuilder] = field(
default_factory=lambda: []
@ -313,8 +313,12 @@ def bind_kv_cache(
# TODO - analyze where runner_kv_caches is used and the right
# way to ensure it properly reflects multiple attention layers
# in the same decoder block.
if current_platform.is_cuda_alike() or current_platform.is_xpu():
# We know that the GPU runner is not impacted by this
if (
current_platform.is_cuda_alike()
or current_platform.is_xpu()
or current_platform.is_cpu()
):
# We know that the GPU / CPU runner is not impacted by this
# case. Some test code depends on runner_kv_caches, but
# not in a way that's impacted by ignoring this.
pass