mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-26 13:55:48 +08:00
Merge branch 'main' into model_arch_cfg
This commit is contained in:
commit
72730a74d5
@ -36,6 +36,11 @@ function cpu_tests() {
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run model tests
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
|
||||
|
||||
# Run kernel tests
|
||||
docker exec cpu-test bash -c "
|
||||
set -e
|
||||
|
||||
@ -574,7 +574,7 @@ async def benchmark(
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Total Token throughput (tok/s):", metrics.total_token_throughput
|
||||
"Total token throughput (tok/s):", metrics.total_token_throughput
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
||||
endif()
|
||||
|
||||
# Build ACL with CMake
|
||||
set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
|
||||
set(CMAKE_BUILD_TYPE "Release")
|
||||
set(ARM_COMPUTE_ARCH "armv8.2-a")
|
||||
set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
|
||||
set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
|
||||
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
|
||||
set(ARM_COMPUTE_ENABLE_OPENMP "ON")
|
||||
set(ARM_COMPUTE_ENABLE_WERROR "OFF")
|
||||
set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
|
||||
set(ARM_COMPUTE_BUILD_TESTING "OFF")
|
||||
|
||||
set(_cmake_config_cmd
|
||||
${CMAKE_COMMAND} -G Ninja -B build
|
||||
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
|
||||
|
||||
@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
|
||||
input.casual = casual;
|
||||
input.isa = isa;
|
||||
input.enable_kv_split = enable_kv_split;
|
||||
TORCH_CHECK(casual, "Only supports casual mask for now.");
|
||||
|
||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
|
||||
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
|
||||
|
||||
@ -186,7 +186,7 @@ struct AttentionMetadata {
|
||||
// - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
|
||||
// * q_tile_size * 4, partial output, max + sum (float)
|
||||
// Reduction scratchpad contains:
|
||||
// - flags: bool array to indicate wether the split is finished
|
||||
// - flags: bool array to indicate whether the split is finished
|
||||
// - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
|
||||
// - max, sum: 2 * split_num * q_tile_size * 4
|
||||
class AttentionScratchPad {
|
||||
|
||||
@ -617,7 +617,7 @@ struct MacheteCollectiveMma {
|
||||
|
||||
// Same as upstream, should be kept the same when possible, not formatted for
|
||||
// easier comparison
|
||||
// with `SwapAB ? N : M -> M` since we dont support SwapAB
|
||||
// with `SwapAB ? N : M -> M` since we don't support SwapAB
|
||||
// clang-format off
|
||||
template<class ProblemShape>
|
||||
static bool
|
||||
|
||||
@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
|
||||
}
|
||||
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
|
||||
|
||||
// Find the min val of div2 that doesn't increase N/(div1*div2)
|
||||
int mindiv(int N, int div1, int div2) {
|
||||
int nPrRnd = div1 * div2;
|
||||
int rnds0 = N / nPrRnd;
|
||||
nPrRnd -= div1 * 3;
|
||||
int rnds3 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds4 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds5 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds6 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds7 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds8 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rnds9 = N / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
int rtn = div2;
|
||||
if (rnds0 == rnds3) rtn = div2 - 3;
|
||||
if (rnds0 == rnds4) rtn = div2 - 4;
|
||||
if (rnds0 == rnds5) rtn = div2 - 5;
|
||||
if (rnds0 == rnds6) rtn = div2 - 6;
|
||||
if (rnds0 == rnds7) rtn = div2 - 7;
|
||||
if (rnds0 == rnds8) rtn = div2 - 8;
|
||||
if (rnds0 == rnds9) rtn = div2 - 9;
|
||||
return rtn;
|
||||
int rnds[13];
|
||||
for (int i = 0; i < 13; i++) {
|
||||
rnds[i] = (N + nPrRnd - 1) / nPrRnd;
|
||||
nPrRnd -= div1;
|
||||
}
|
||||
for (int i = 12; i >= 0; i--)
|
||||
if (rnds[0] == rnds[i]) return (div2 - i);
|
||||
}
|
||||
|
||||
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
const int max_lds_len = get_lds_size() / 2;
|
||||
|
||||
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
|
||||
_N) \
|
||||
{ \
|
||||
dim3 block(64, _WvPrGrp); \
|
||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
|
||||
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
} else if (K_in * N_in <= max_lds_len * 1.2) { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
|
||||
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
} else { \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \
|
||||
wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
} \
|
||||
#define WVSPLITK(_YTILE, _UNRL, _N) \
|
||||
{ \
|
||||
dim3 block(64, 16); \
|
||||
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \
|
||||
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \
|
||||
wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
else if (K_in * N_in <= max_lds_len * 1.2) \
|
||||
wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
else \
|
||||
wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
|
||||
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
|
||||
biasf4, c, __wvPrGrp, CuCount); \
|
||||
}
|
||||
|
||||
#define WVSPLIT_TILE(_sYT, __N) \
|
||||
{ \
|
||||
bool fit_lds = (K_in * N_in <= max_lds_len); \
|
||||
if (_sYT <= 1) \
|
||||
WVSPLITK(1, 4, __N) \
|
||||
else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
|
||||
WVSPLITK(2, 2, __N) \
|
||||
else if (_sYT <= 4 * 3) \
|
||||
WVSPLITK(3, 2, __N) \
|
||||
else if (__N == 4) \
|
||||
WVSPLITK(4, 1, __N) \
|
||||
else \
|
||||
WVSPLITK(4, 2, __N) \
|
||||
}
|
||||
|
||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
|
||||
@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
|
||||
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
|
||||
: nullptr;
|
||||
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
|
||||
|
||||
// first shoot for biggest tile-size that keeps all simd busy,
|
||||
// then cut the active waves to balance their distribution...
|
||||
int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
|
||||
|
||||
switch (N_in) {
|
||||
case 1:
|
||||
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1)
|
||||
WVSPLIT_TILE(sYT, 1)
|
||||
break;
|
||||
case 2:
|
||||
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2)
|
||||
WVSPLIT_TILE(sYT, 2)
|
||||
break;
|
||||
case 3:
|
||||
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3)
|
||||
WVSPLIT_TILE(sYT, 3)
|
||||
break;
|
||||
case 4:
|
||||
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4)
|
||||
WVSPLIT_TILE(sYT, 4)
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
|
||||
@ -84,7 +84,7 @@ Total input tokens: 1369
|
||||
Total generated tokens: 2212
|
||||
Request throughput (req/s): 1.73
|
||||
Output token throughput (tok/s): 382.89
|
||||
Total Token throughput (tok/s): 619.85
|
||||
Total token throughput (tok/s): 619.85
|
||||
---------------Time to First Token----------------
|
||||
Mean TTFT (ms): 71.54
|
||||
Median TTFT (ms): 73.88
|
||||
|
||||
@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request
|
||||
|
||||
### v1 Metrics
|
||||
|
||||
In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
|
||||
In v1, an extensive set of metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example:
|
||||
|
||||
- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
|
||||
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
|
||||
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
|
||||
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
|
||||
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
|
||||
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
|
||||
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
|
||||
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
|
||||
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
|
||||
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
|
||||
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
|
||||
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
|
||||
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
|
||||
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
|
||||
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
|
||||
- `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
|
||||
- `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests.
|
||||
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
|
||||
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
|
||||
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
|
||||
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
|
||||
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
|
||||
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
|
||||
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.
|
||||
|
||||
|
||||
@ -22,7 +22,7 @@ python tools/install_nixl_from_source_ubuntu.py
|
||||
NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:
|
||||
|
||||
```bash
|
||||
# Example UCX configuration, adjust according to your enviroment
|
||||
# Example UCX configuration, adjust according to your environment
|
||||
export UCX_TLS=all # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
|
||||
export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1"
|
||||
```
|
||||
|
||||
149
docs/mkdocs/hooks/generate_metrics.py
Normal file
149
docs/mkdocs/hooks/generate_metrics.py
Normal file
@ -0,0 +1,149 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import ast
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
logger = logging.getLogger("mkdocs")
|
||||
|
||||
ROOT_DIR = Path(__file__).parent.parent.parent.parent
|
||||
DOCS_DIR = ROOT_DIR / "docs"
|
||||
GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics"
|
||||
|
||||
# Files to scan for metric definitions - each will generate a separate table
|
||||
METRIC_SOURCE_FILES = [
|
||||
{"path": "vllm/v1/metrics/loggers.py", "output": "general.md"},
|
||||
{
|
||||
"path": "vllm/v1/spec_decode/metrics.py",
|
||||
"output": "spec_decode.md",
|
||||
},
|
||||
{
|
||||
"path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
|
||||
"output": "nixl_connector.md",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
class MetricExtractor(ast.NodeVisitor):
|
||||
"""AST visitor to extract metric definitions."""
|
||||
|
||||
def __init__(self):
|
||||
self.metrics: list[dict[str, str]] = []
|
||||
|
||||
def visit_Call(self, node: ast.Call) -> None:
|
||||
"""Visit function calls to find metric class instantiations."""
|
||||
metric_type = self._get_metric_type(node)
|
||||
if metric_type:
|
||||
name = self._extract_kwarg(node, "name")
|
||||
documentation = self._extract_kwarg(node, "documentation")
|
||||
|
||||
if name:
|
||||
self.metrics.append(
|
||||
{
|
||||
"name": name,
|
||||
"type": metric_type,
|
||||
"documentation": documentation or "",
|
||||
}
|
||||
)
|
||||
|
||||
self.generic_visit(node)
|
||||
|
||||
def _get_metric_type(self, node: ast.Call) -> str | None:
|
||||
"""Determine if this call creates a metric and return its type."""
|
||||
metric_type_map = {
|
||||
"_gauge_cls": "gauge",
|
||||
"_counter_cls": "counter",
|
||||
"_histogram_cls": "histogram",
|
||||
}
|
||||
if isinstance(node.func, ast.Attribute):
|
||||
return metric_type_map.get(node.func.attr)
|
||||
return None
|
||||
|
||||
def _extract_kwarg(self, node: ast.Call, key: str) -> str | None:
|
||||
"""Extract a keyword argument value from a function call."""
|
||||
for keyword in node.keywords:
|
||||
if keyword.arg == key:
|
||||
return self._get_string_value(keyword.value)
|
||||
return None
|
||||
|
||||
def _get_string_value(self, node: ast.AST) -> str | None:
|
||||
"""Extract string value from an AST node."""
|
||||
if isinstance(node, ast.Constant):
|
||||
return str(node.value) if node.value is not None else None
|
||||
return None
|
||||
|
||||
|
||||
def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]:
|
||||
"""Parse a Python file and extract all metric definitions."""
|
||||
try:
|
||||
with open(filepath, encoding="utf-8") as f:
|
||||
source = f.read()
|
||||
|
||||
tree = ast.parse(source, filename=str(filepath))
|
||||
extractor = MetricExtractor()
|
||||
extractor.visit(tree)
|
||||
return extractor.metrics
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Failed to parse {filepath}: {e}") from e
|
||||
|
||||
|
||||
def generate_markdown_table(metrics: list[dict[str, str]]) -> str:
|
||||
"""Generate a markdown table from extracted metrics."""
|
||||
if not metrics:
|
||||
return "No metrics found.\n"
|
||||
|
||||
# Sort by type, then by name
|
||||
metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"]))
|
||||
|
||||
lines = []
|
||||
lines.append("| Metric Name | Type | Description |")
|
||||
lines.append("|-------------|------|-------------|")
|
||||
|
||||
for metric in metrics_sorted:
|
||||
name = metric["name"]
|
||||
metric_type = metric["type"].capitalize()
|
||||
doc = metric["documentation"].replace("\n", " ").strip()
|
||||
lines.append(f"| `{name}` | {metric_type} | {doc} |")
|
||||
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
"""Generate metrics documentation tables from source files."""
|
||||
logger.info("Generating metrics documentation")
|
||||
|
||||
# Create generated directory if it doesn't exist
|
||||
GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
total_metrics = 0
|
||||
for source_config in METRIC_SOURCE_FILES:
|
||||
source_path = source_config["path"]
|
||||
output_file = source_config["output"]
|
||||
|
||||
filepath = ROOT_DIR / source_path
|
||||
if not filepath.exists():
|
||||
raise FileNotFoundError(f"Metrics source file not found: {filepath}")
|
||||
|
||||
logger.debug("Extracting metrics from: %s", source_path)
|
||||
metrics = extract_metrics_from_file(filepath)
|
||||
logger.debug("Found %d metrics in %s", len(metrics), source_path)
|
||||
|
||||
# Generate and write the markdown table for this source
|
||||
table_content = generate_markdown_table(metrics)
|
||||
output_path = GENERATED_METRICS_DIR / output_file
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write(table_content)
|
||||
|
||||
total_metrics += len(metrics)
|
||||
logger.info(
|
||||
"Generated metrics table: %s (%d metrics)",
|
||||
output_path.relative_to(ROOT_DIR),
|
||||
len(metrics),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Total metrics generated: %d across %d files",
|
||||
total_metrics,
|
||||
len(METRIC_SOURCE_FILES),
|
||||
)
|
||||
@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server:
|
||||
|
||||
The following metrics are exposed:
|
||||
|
||||
??? code
|
||||
## General Metrics
|
||||
|
||||
```python
|
||||
--8<-- "vllm/engine/metrics.py:metrics-definitions"
|
||||
```
|
||||
--8<-- "docs/generated/metrics/general.md"
|
||||
|
||||
## Speculative Decoding Metrics
|
||||
|
||||
--8<-- "docs/generated/metrics/spec_decode.md"
|
||||
|
||||
## NIXL KV Connector Metrics
|
||||
|
||||
--8<-- "docs/generated/metrics/nixl_connector.md"
|
||||
|
||||
## Deprecation Policy
|
||||
|
||||
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
||||
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
|
||||
|
||||
@ -51,6 +51,7 @@ hooks:
|
||||
- docs/mkdocs/hooks/remove_announcement.py
|
||||
- docs/mkdocs/hooks/generate_examples.py
|
||||
- docs/mkdocs/hooks/generate_argparse.py
|
||||
- docs/mkdocs/hooks/generate_metrics.py
|
||||
- docs/mkdocs/hooks/url_schemes.py
|
||||
|
||||
plugins:
|
||||
|
||||
@ -75,7 +75,7 @@ torchgeo==0.7.0
|
||||
mteb==2.1.2
|
||||
|
||||
# Data processing
|
||||
xgrammar==0.1.27
|
||||
xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
|
||||
# Test async scheduling
|
||||
|
||||
# Utilities
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
|
||||
from openai.types.responses.response_function_tool_call_output_item import (
|
||||
ResponseFunctionToolCallOutputItem,
|
||||
)
|
||||
@ -14,7 +15,8 @@ from openai.types.responses.response_reasoning_item import (
|
||||
)
|
||||
|
||||
from vllm.entrypoints.responses_utils import (
|
||||
construct_chat_message_with_tool_call,
|
||||
_construct_single_message_from_response_item,
|
||||
construct_chat_messages_with_tool_call,
|
||||
convert_tool_responses_to_completions_format,
|
||||
)
|
||||
|
||||
@ -42,7 +44,43 @@ class TestResponsesUtils:
|
||||
|
||||
assert result == {"type": "function", "function": input_tool}
|
||||
|
||||
def test_construct_chat_message_with_tool_call(self):
|
||||
def test_construct_chat_messages_with_tool_call(self):
|
||||
"""Test construction of chat messages with tool calls."""
|
||||
reasoning_item = ResponseReasoningItem(
|
||||
id="lol",
|
||||
summary=[],
|
||||
type="reasoning",
|
||||
content=[
|
||||
Content(
|
||||
text="Leroy Jenkins",
|
||||
type="reasoning_text",
|
||||
)
|
||||
],
|
||||
encrypted_content=None,
|
||||
status=None,
|
||||
)
|
||||
mcp_tool_item = ResponseFunctionToolCall(
|
||||
id="mcp_123",
|
||||
call_id="call_123",
|
||||
type="function_call",
|
||||
status="completed",
|
||||
name="python",
|
||||
arguments='{"code": "123+456"}',
|
||||
)
|
||||
input_items = [reasoning_item, mcp_tool_item]
|
||||
messages = construct_chat_messages_with_tool_call(input_items)
|
||||
|
||||
assert len(messages) == 1
|
||||
message = messages[0]
|
||||
assert message["role"] == "assistant"
|
||||
assert message["reasoning"] == "Leroy Jenkins"
|
||||
assert message["tool_calls"][0]["id"] == "call_123"
|
||||
assert message["tool_calls"][0]["function"]["name"] == "python"
|
||||
assert (
|
||||
message["tool_calls"][0]["function"]["arguments"] == '{"code": "123+456"}'
|
||||
)
|
||||
|
||||
def test_construct_single_message_from_response_item(self):
|
||||
item = ResponseReasoningItem(
|
||||
id="lol",
|
||||
summary=[],
|
||||
@ -56,7 +94,7 @@ class TestResponsesUtils:
|
||||
encrypted_content=None,
|
||||
status=None,
|
||||
)
|
||||
formatted_item = construct_chat_message_with_tool_call(item)
|
||||
formatted_item = _construct_single_message_from_response_item(item)
|
||||
assert formatted_item["role"] == "assistant"
|
||||
assert formatted_item["reasoning"] == "Leroy Jenkins"
|
||||
|
||||
@ -74,7 +112,7 @@ class TestResponsesUtils:
|
||||
status=None,
|
||||
)
|
||||
|
||||
formatted_item = construct_chat_message_with_tool_call(item)
|
||||
formatted_item = _construct_single_message_from_response_item(item)
|
||||
assert formatted_item["role"] == "assistant"
|
||||
assert (
|
||||
formatted_item["reasoning"]
|
||||
@ -88,7 +126,7 @@ class TestResponsesUtils:
|
||||
output="1234",
|
||||
status="completed",
|
||||
)
|
||||
formatted_item = construct_chat_message_with_tool_call(tool_call_output)
|
||||
formatted_item = _construct_single_message_from_response_item(tool_call_output)
|
||||
assert formatted_item["role"] == "tool"
|
||||
assert formatted_item["content"] == "1234"
|
||||
assert formatted_item["tool_call_id"] == "temp"
|
||||
@ -102,7 +140,7 @@ class TestResponsesUtils:
|
||||
status=None,
|
||||
)
|
||||
with pytest.raises(ValueError):
|
||||
construct_chat_message_with_tool_call(item)
|
||||
_construct_single_message_from_response_item(item)
|
||||
|
||||
output_item = ResponseOutputMessage(
|
||||
id="msg_bf585bbbe3d500e0",
|
||||
@ -119,6 +157,6 @@ class TestResponsesUtils:
|
||||
type="message",
|
||||
)
|
||||
|
||||
formatted_item = construct_chat_message_with_tool_call(output_item)
|
||||
formatted_item = _construct_single_message_from_response_item(output_item)
|
||||
assert formatted_item["role"] == "assistant"
|
||||
assert formatted_item["content"] == "dongyi"
|
||||
|
||||
@ -7,7 +7,8 @@ import math
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
|
||||
|
||||
if not current_platform.is_cpu():
|
||||
pytest.skip("skipping CPU-only tests", allow_module_level=True)
|
||||
@ -36,6 +37,21 @@ SEQ_LENS = [ # (q_len, kv_len)
|
||||
]
|
||||
|
||||
|
||||
def get_attn_isa(
|
||||
block_size: int | None = None,
|
||||
dtype: torch.dtype | None = None,
|
||||
):
|
||||
if block_size and dtype:
|
||||
return _get_attn_isa(dtype, block_size)
|
||||
else:
|
||||
if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
|
||||
return "neon"
|
||||
elif torch._C._cpu._is_amx_tile_supported():
|
||||
return "amx"
|
||||
else:
|
||||
return "vec"
|
||||
|
||||
|
||||
# rand number generation takes too much time, cache rand tensors
|
||||
@functools.lru_cache(maxsize=128, typed=False)
|
||||
def tensor_cache(
|
||||
@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("block_size", [96, 128])
|
||||
@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
|
||||
@pytest.mark.parametrize("dtype", QTYPES)
|
||||
@pytest.mark.parametrize("soft_cap", [None])
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("use_alibi", [False])
|
||||
@pytest.mark.parametrize("use_sink", [False])
|
||||
@pytest.mark.parametrize("isa", ["neon"])
|
||||
@pytest.mark.skipif(
|
||||
current_platform.get_cpu_architecture() != CpuArchEnum.ARM,
|
||||
reason="Not an Arm CPU.",
|
||||
)
|
||||
def test_varlen_with_paged_kv_normal_neon(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
head_size: int,
|
||||
sliding_window: int | None,
|
||||
dtype: torch.dtype,
|
||||
block_size: int,
|
||||
soft_cap: float | None,
|
||||
num_blocks: int,
|
||||
use_alibi: bool,
|
||||
use_sink: bool,
|
||||
isa: str,
|
||||
) -> None:
|
||||
varlen_with_paged_kv(
|
||||
seq_lens=seq_lens,
|
||||
num_heads=num_heads,
|
||||
head_size=head_size,
|
||||
sliding_window=sliding_window,
|
||||
dtype=dtype,
|
||||
block_size=block_size,
|
||||
soft_cap=soft_cap,
|
||||
num_blocks=num_blocks,
|
||||
use_alibi=use_alibi,
|
||||
use_sink=use_sink,
|
||||
isa=isa,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
|
||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||
@pytest.mark.parametrize("head_size", [96])
|
||||
@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16(
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("use_alibi", [False])
|
||||
@pytest.mark.parametrize("use_sink", [False])
|
||||
@pytest.mark.parametrize(
|
||||
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
|
||||
)
|
||||
@pytest.mark.parametrize("isa", [get_attn_isa()])
|
||||
def test_varlen_with_paged_kv_softcap(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap(
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("use_alibi", [True])
|
||||
@pytest.mark.parametrize("use_sink", [False])
|
||||
@pytest.mark.parametrize(
|
||||
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
|
||||
)
|
||||
@pytest.mark.parametrize("isa", [get_attn_isa()])
|
||||
def test_varlen_with_paged_kv_alibi(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi(
|
||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||
@pytest.mark.parametrize("use_alibi", [False])
|
||||
@pytest.mark.parametrize("use_sink", [True])
|
||||
@pytest.mark.parametrize(
|
||||
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
|
||||
)
|
||||
@pytest.mark.parametrize("isa", [get_attn_isa()])
|
||||
def test_varlen_with_paged_kv_sink(
|
||||
seq_lens: list[tuple[int, int]],
|
||||
num_heads: tuple[int, int],
|
||||
|
||||
@ -54,6 +54,10 @@ def setup_cuda():
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_fp8_fnuz(),
|
||||
reason="This platform supports e4m3fnuz, not e4m3fn.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"num_tokens,d,dtype,group_size,seed",
|
||||
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS),
|
||||
@ -78,14 +82,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
|
||||
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
|
||||
torch.manual_seed(seed)
|
||||
factor_for_scale = 1e-2
|
||||
fp8_info = torch.finfo(torch.float8_e4m3fn)
|
||||
fp8_info = torch.finfo(current_platform.fp8_dtype())
|
||||
fp8_max, fp8_min = fp8_info.max, fp8_info.min
|
||||
|
||||
A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
|
||||
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
|
||||
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
|
||||
|
||||
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
|
||||
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
|
||||
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
|
||||
|
||||
block_n, block_k = block_size[0], block_size[1]
|
||||
n_tiles = (N + block_n - 1) // block_n
|
||||
@ -103,6 +107,9 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
|
||||
assert rel_diff < 0.001
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(), reason="CUTLASS only supported on CUDA platform."
|
||||
)
|
||||
@torch.inference_mode()
|
||||
def test_w8a8_block_fp8_cutlass_matmul():
|
||||
# Test simple case where weight.shape % 128 != 0,
|
||||
@ -151,6 +158,10 @@ def test_w8a8_block_fp8_cutlass_matmul():
|
||||
assert rel_diff < 0.001
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
current_platform.is_fp8_fnuz(),
|
||||
reason="This platform supports e4m3fnuz, not e4m3fn.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"M,N,K,block_size,out_dtype,seed",
|
||||
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
|
||||
|
||||
@ -15,6 +15,9 @@ from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.math_utils import cdiv
|
||||
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
|
||||
|
||||
MNK_FACTORS = [
|
||||
(1, 256, 128),
|
||||
(1, 16384, 1024),
|
||||
|
||||
@ -21,6 +21,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
|
||||
|
||||
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
|
||||
# unit tests to a common utility function. Currently the use of
|
||||
# `is_quant_method_supported` conflates kernels with quantization methods
|
||||
|
||||
@ -92,16 +92,19 @@ def run_test(
|
||||
*,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: str | None = None,
|
||||
dtype: str = "half",
|
||||
) -> None:
|
||||
prompt_list = PROMPTS * 10
|
||||
expected_list = EXPECTED[model] * 10
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype="half",
|
||||
dtype=dtype,
|
||||
max_model_len=448,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
# TODO (NickLucche) figure out output differences with non-eager and re-enable
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
|
||||
@ -120,12 +123,28 @@ def run_test(
|
||||
|
||||
@pytest.mark.core_model
|
||||
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@create_new_process_for_each_test()
|
||||
def test_models(vllm_runner, model) -> None:
|
||||
def test_models(vllm_runner, model, dtype) -> None:
|
||||
run_test(
|
||||
vllm_runner,
|
||||
model,
|
||||
tensor_parallel_size=1,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.cpu_model
|
||||
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_models_cpu(vllm_runner, model, dtype) -> None:
|
||||
# @create_new_process_for_each_test() does not work for some runners
|
||||
# TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
|
||||
run_test(
|
||||
vllm_runner,
|
||||
model,
|
||||
tensor_parallel_size=1,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import mimetypes
|
||||
import os
|
||||
@ -186,6 +187,7 @@ async def test_fetch_image_error_conversion():
|
||||
connector.fetch_image(broken_img)
|
||||
|
||||
|
||||
@pytest.mark.flaky(reruns=3, reruns_delay=5)
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
|
||||
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
|
||||
@ -198,8 +200,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
|
||||
}
|
||||
)
|
||||
|
||||
video_sync, metadata_sync = connector.fetch_video(video_url)
|
||||
video_async, metadata_async = await connector.fetch_video_async(video_url)
|
||||
try:
|
||||
video_sync, metadata_sync = connector.fetch_video(video_url)
|
||||
video_async, metadata_async = await connector.fetch_video_async(video_url)
|
||||
except (TimeoutError, asyncio.TimeoutError) as e:
|
||||
pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")
|
||||
|
||||
assert np.array_equal(video_sync, video_async)
|
||||
assert metadata_sync == metadata_async
|
||||
|
||||
|
||||
@ -106,8 +106,8 @@ def create_common_attn_metadata(
|
||||
query_start_loc=query_start_loc,
|
||||
query_start_loc_cpu=query_start_loc_cpu,
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens_cpu,
|
||||
num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
_seq_lens_cpu=seq_lens_cpu,
|
||||
_num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
num_reqs=batch_spec.batch_size,
|
||||
num_actual_tokens=num_tokens,
|
||||
max_query_len=max_query_len,
|
||||
|
||||
@ -8,6 +8,7 @@ import torch._dynamo.config as dynamo_config
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import StructuredOutputsParams
|
||||
from vllm.v1.metrics.reader import Metric
|
||||
|
||||
@ -70,6 +71,18 @@ def test_without_spec_decoding(
|
||||
(True, "uni", True, None, True),
|
||||
]
|
||||
|
||||
if current_platform.is_rocm():
|
||||
# On ROCm, Only test with structured_outputs (deterministic)
|
||||
# and skip chunk_prefill (more variable).
|
||||
test_configs = [
|
||||
cfg
|
||||
for cfg in test_configs
|
||||
if not cfg[4] # skip chunk_prefill=True
|
||||
]
|
||||
test_sampling_params = [
|
||||
p for p in test_sampling_params if p.get("structured_outputs") is not None
|
||||
]
|
||||
|
||||
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
|
||||
|
||||
|
||||
@ -108,7 +121,14 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
|
||||
(True, "uni", True, spec_config_short, True),
|
||||
]
|
||||
|
||||
run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params)
|
||||
# On ROCm, use TRITON_ATTN + float32 for better numerical consistency
|
||||
run_tests(
|
||||
monkeypatch,
|
||||
MTP_MODEL,
|
||||
test_configs,
|
||||
test_sampling_params,
|
||||
is_testing_with_spec_decoding=True,
|
||||
)
|
||||
|
||||
|
||||
@dynamo_config.patch(cache_size_limit=16)
|
||||
@ -117,13 +137,21 @@ def run_tests(
|
||||
model: str,
|
||||
test_configs: list[tuple],
|
||||
test_sampling_params: list[dict[str, Any]],
|
||||
is_testing_with_spec_decoding: bool = False,
|
||||
):
|
||||
"""Test consistency of combos of async scheduling, preemption,
|
||||
uni/multiproc executor with spec decoding."""
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
# avoid precision errors
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
||||
if current_platform.is_rocm():
|
||||
if is_testing_with_spec_decoding:
|
||||
# Use TRITON_ATTN for spec decoding test for consistency
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
|
||||
else:
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
|
||||
else:
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
||||
# lock matmul precision to full FP32
|
||||
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
|
||||
# m.setenv("VLLM_BATCH_INVARIANT", "1")
|
||||
@ -145,6 +173,7 @@ def run_tests(
|
||||
async_scheduling,
|
||||
spec_config,
|
||||
test_prefill_chunking=test_prefill_chunking,
|
||||
is_testing_with_spec_decoding=is_testing_with_spec_decoding,
|
||||
)
|
||||
outputs.append(test_results)
|
||||
|
||||
@ -174,17 +203,34 @@ def run_tests(
|
||||
name_0=f"baseline=[{baseline_config}], params={params}",
|
||||
name_1=f"config=[{test_config}], params={params}",
|
||||
)
|
||||
assert _all_logprobs_match(base_logprobs, test_logprobs)
|
||||
|
||||
# On ROCm with TRITON_ATTN (spec decoding test), skip strict
|
||||
# logprobs comparison when logprobs are requested
|
||||
skip_logprobs_check = (
|
||||
current_platform.is_rocm()
|
||||
and params.get("logprobs")
|
||||
and is_testing_with_spec_decoding
|
||||
)
|
||||
if not skip_logprobs_check:
|
||||
assert _all_logprobs_match(base_logprobs, test_logprobs)
|
||||
|
||||
if (
|
||||
base_acceptance_rate is not None
|
||||
and test_acceptance_rate is not None
|
||||
):
|
||||
if "spec_mml=None" in test_config:
|
||||
# Preemption causes more variance in acceptance rates
|
||||
if (
|
||||
current_platform.is_rocm()
|
||||
and "preemption=True" in test_config
|
||||
):
|
||||
tolerance = 0.10
|
||||
else:
|
||||
tolerance = 0.05
|
||||
assert (
|
||||
test_acceptance_rate > base_acceptance_rate
|
||||
or test_acceptance_rate
|
||||
== pytest.approx(base_acceptance_rate, rel=5e-2)
|
||||
== pytest.approx(base_acceptance_rate, rel=tolerance)
|
||||
)
|
||||
else:
|
||||
# Currently the reported acceptance rate is expected to be
|
||||
@ -215,6 +261,7 @@ def run_test(
|
||||
async_scheduling: bool,
|
||||
spec_config: dict[str, Any] | None,
|
||||
test_prefill_chunking: bool,
|
||||
is_testing_with_spec_decoding: bool = False,
|
||||
):
|
||||
spec_decoding = spec_config is not None
|
||||
cache_arg: dict[str, Any] = (
|
||||
@ -233,6 +280,15 @@ def run_test(
|
||||
print("-" * 80)
|
||||
print(f"---- TESTING {test_str}: {test_config}")
|
||||
print("-" * 80)
|
||||
|
||||
# On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for
|
||||
# spec decoding test (TRITON_ATTN) for better precision.
|
||||
# On others: always use float32.
|
||||
if current_platform.is_rocm() and not is_testing_with_spec_decoding:
|
||||
dtype = "float16"
|
||||
else:
|
||||
dtype = "float32"
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=512,
|
||||
@ -242,7 +298,7 @@ def run_test(
|
||||
# enforce_eager=True,
|
||||
async_scheduling=async_scheduling,
|
||||
distributed_executor_backend=executor,
|
||||
dtype="float32", # avoid precision errors
|
||||
dtype=dtype,
|
||||
speculative_config=spec_config,
|
||||
disable_log_stats=False,
|
||||
**cache_arg,
|
||||
@ -302,11 +358,21 @@ def _all_logprobs_match(req_a, req_b) -> bool:
|
||||
|
||||
|
||||
def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
|
||||
return len(lps_a) == len(lps_b) and all(
|
||||
a.decoded_token == b.decoded_token
|
||||
and a.rank == b.rank
|
||||
and a.logprob == pytest.approx(b.logprob, rel=1e-3, abs=1e-6)
|
||||
for a, b in ((lps_a[x], lps_b[x]) for x in lps_a)
|
||||
if current_platform.is_rocm():
|
||||
# ROCm has higher numerical variance
|
||||
# due to use of float16.
|
||||
rel_tol, abs_tol = 5e-2, 1e-5
|
||||
else:
|
||||
rel_tol, abs_tol = 1e-3, 1e-6
|
||||
return (
|
||||
len(lps_a) == len(lps_b)
|
||||
and lps_a.keys() == lps_b.keys()
|
||||
and all(
|
||||
a.decoded_token == b.decoded_token
|
||||
and a.rank == b.rank
|
||||
and a.logprob == pytest.approx(b.logprob, rel=rel_tol, abs=abs_tol)
|
||||
for a, b in ((lps_a[x], lps_b[x]) for x in lps_a)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
|
||||
131
tests/v1/e2e/test_async_spec_decode.py
Normal file
131
tests/v1/e2e/test_async_spec_decode.py
Normal file
@ -0,0 +1,131 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Test that verifies no implicit GPU-CPU synchronization occurs during
|
||||
speculative decoding generation under expected conditions.
|
||||
"""
|
||||
|
||||
import multiprocessing
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sync_tracker():
|
||||
"""
|
||||
Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
|
||||
lazy init syncs. Prints stack traces immediately when syncs occur.
|
||||
"""
|
||||
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
|
||||
|
||||
# Shared counter for cross-process communication (inherited by fork)
|
||||
sync_count = multiprocessing.Value("i", 0)
|
||||
|
||||
# Save original property
|
||||
original_prop = CommonAttentionMetadata.seq_lens_cpu
|
||||
original_fget = original_prop.fget
|
||||
|
||||
# Create tracking wrapper
|
||||
def tracking_seq_lens_cpu(self):
|
||||
if self._seq_lens_cpu is None:
|
||||
# Increment counter
|
||||
with sync_count.get_lock():
|
||||
sync_count.value += 1
|
||||
count = sync_count.value
|
||||
# Print stack trace immediately (shows in subprocess output)
|
||||
print(f"\n{'=' * 60}", file=sys.stderr)
|
||||
print(f"SYNC #{count}: seq_lens_cpu lazy init triggered!", file=sys.stderr)
|
||||
print(f"{'=' * 60}", file=sys.stderr)
|
||||
traceback.print_stack(file=sys.stderr)
|
||||
print(f"{'=' * 60}\n", file=sys.stderr)
|
||||
sys.stderr.flush()
|
||||
return original_fget(self)
|
||||
|
||||
# Apply patch
|
||||
CommonAttentionMetadata.seq_lens_cpu = property(tracking_seq_lens_cpu)
|
||||
|
||||
class SyncTracker:
|
||||
@property
|
||||
def count(self) -> int:
|
||||
return sync_count.value
|
||||
|
||||
def assert_no_sync(self, msg: str = ""):
|
||||
count = sync_count.value
|
||||
assert count == 0, (
|
||||
f"Unexpected GPU-CPU sync: seq_lens_cpu lazy init triggered "
|
||||
f"{count} times. See stack traces above. {msg}"
|
||||
)
|
||||
|
||||
yield SyncTracker()
|
||||
|
||||
# Restore original property
|
||||
CommonAttentionMetadata.seq_lens_cpu = original_prop
|
||||
torch._dynamo.reset()
|
||||
|
||||
|
||||
# Test configurations: (model, spec_model, method, num_spec_tokens, backend_env)
|
||||
SPEC_DECODE_CONFIGS = [
|
||||
pytest.param(
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"nm-testing/Llama3_2_1B_speculator.eagle3",
|
||||
"eagle3",
|
||||
2,
|
||||
id="eagle3-llama",
|
||||
),
|
||||
pytest.param(
|
||||
"eagle618/deepseek-v3-random",
|
||||
"eagle618/eagle-deepseek-v3-random",
|
||||
"eagle",
|
||||
2,
|
||||
id="eagle-mla-deepseek",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model,spec_model,method,num_spec_tokens",
|
||||
SPEC_DECODE_CONFIGS,
|
||||
)
|
||||
def test_no_sync_with_spec_decode(
|
||||
sync_tracker,
|
||||
model: str,
|
||||
spec_model: str,
|
||||
method: str,
|
||||
num_spec_tokens: int,
|
||||
):
|
||||
"""
|
||||
Test that no implicit GPU-CPU sync occurs during speculative decoding
|
||||
generation.
|
||||
"""
|
||||
# Import vLLM AFTER sync_tracker fixture has applied the patch
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
|
||||
llm = LLM(
|
||||
model=model,
|
||||
max_model_len=256,
|
||||
speculative_config={
|
||||
"method": method,
|
||||
"num_speculative_tokens": num_spec_tokens,
|
||||
"model": spec_model,
|
||||
},
|
||||
enforce_eager=True,
|
||||
async_scheduling=True,
|
||||
)
|
||||
|
||||
outputs = llm.generate(
|
||||
["Hello, my name is"],
|
||||
SamplingParams(temperature=0, max_tokens=10),
|
||||
)
|
||||
|
||||
assert len(outputs) == 1
|
||||
assert len(outputs[0].outputs[0].text) > 0
|
||||
|
||||
del llm
|
||||
torch.cuda.empty_cache()
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
sync_tracker.assert_no_sync()
|
||||
@ -191,8 +191,8 @@ def test_suffix_decoding_acceptance(
|
||||
# Expect the acceptance rate to improve.
|
||||
assert first_accept_rate < last_accept_rate
|
||||
|
||||
# Heuristic: expect at least 82.5% acceptance rate at the end.
|
||||
assert last_accept_rate > 0.825
|
||||
# Heuristic: expect at least 80.0% acceptance rate at the end.
|
||||
assert last_accept_rate > 0.80
|
||||
|
||||
del spec_llm
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
@ -88,8 +88,8 @@ def forward_attention(
|
||||
query_start_loc=query_start_loc,
|
||||
query_start_loc_cpu=query_start_loc.cpu(),
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens.cpu(),
|
||||
num_computed_tokens_cpu=context_lens.cpu(),
|
||||
_seq_lens_cpu=seq_lens.cpu(),
|
||||
_num_computed_tokens_cpu=context_lens.cpu(),
|
||||
num_reqs=batch_size,
|
||||
num_actual_tokens=num_actual_tokens,
|
||||
max_query_len=max_query_len,
|
||||
|
||||
@ -103,7 +103,7 @@ def create_cross_attention_backend(
|
||||
# needed here to know how many tokens to attend to from the cached
|
||||
# cross-attention KV cache.
|
||||
new_metadata.seq_lens = common_attn_metadata.encoder_seq_lens
|
||||
new_metadata.seq_lens_cpu = torch.from_numpy(
|
||||
new_metadata._seq_lens_cpu = torch.from_numpy(
|
||||
common_attn_metadata.encoder_seq_lens_cpu
|
||||
)
|
||||
|
||||
|
||||
@ -788,7 +788,7 @@ async def benchmark(
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Total Token throughput (tok/s):", metrics.total_token_throughput
|
||||
"Total token throughput (tok/s):", metrics.total_token_throughput
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@ -667,8 +667,9 @@ class VllmConfig:
|
||||
|
||||
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
|
||||
self._apply_optimization_level_defaults(default_config)
|
||||
|
||||
if (
|
||||
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
||||
self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
|
||||
and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
|
||||
):
|
||||
logger.info(
|
||||
@ -693,22 +694,29 @@ class VllmConfig:
|
||||
|
||||
if current_platform.support_static_graph_mode():
|
||||
# if cudagraph_mode has full cudagraphs, we need to check support
|
||||
if (
|
||||
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||
and self.model_config is not None
|
||||
):
|
||||
if self.model_config.pooler_config is not None:
|
||||
if model_config := self.model_config:
|
||||
if (
|
||||
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||
and model_config.pooler_config is not None
|
||||
):
|
||||
logger.warning_once(
|
||||
"Pooling models do not support full cudagraphs. "
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
)
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
elif self.model_config.is_encoder_decoder:
|
||||
logger.warning_once(
|
||||
"Encoder-decoder models do not support full cudagraphs. "
|
||||
"Overriding cudagraph_mode to PIECEWISE."
|
||||
elif (
|
||||
model_config.is_encoder_decoder
|
||||
and self.compilation_config.cudagraph_mode
|
||||
not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
|
||||
):
|
||||
logger.info_once(
|
||||
"Encoder-decoder models do not support %s. "
|
||||
"Overriding cudagraph_mode to FULL_DECODE_ONLY.",
|
||||
self.compilation_config.cudagraph_mode.name,
|
||||
)
|
||||
self.compilation_config.cudagraph_mode = (
|
||||
CUDAGraphMode.FULL_DECODE_ONLY
|
||||
)
|
||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
|
||||
# disable cudagraph when enforce eager execution
|
||||
if self.model_config is not None and self.model_config.enforce_eager:
|
||||
@ -1048,8 +1056,14 @@ class VllmConfig:
|
||||
self.compilation_config.max_cudagraph_capture_size
|
||||
)
|
||||
if max_cudagraph_capture_size is None:
|
||||
decode_query_len = 1
|
||||
if (
|
||||
self.speculative_config
|
||||
and self.speculative_config.num_speculative_tokens
|
||||
):
|
||||
decode_query_len += self.speculative_config.num_speculative_tokens
|
||||
max_cudagraph_capture_size = min(
|
||||
self.scheduler_config.max_num_seqs * 2, 512
|
||||
self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
|
||||
)
|
||||
max_num_tokens = self.scheduler_config.max_num_batched_tokens
|
||||
max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)
|
||||
|
||||
@ -8,3 +8,5 @@ Shared constants for vLLM entrypoints.
|
||||
# These constants help mitigate header abuse attacks
|
||||
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB
|
||||
H11_MAX_HEADER_COUNT_DEFAULT = 256
|
||||
|
||||
MCP_PREFIX = "mcp_"
|
||||
|
||||
@ -19,6 +19,7 @@ from vllm import envs
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatTemplateContentFormatOption,
|
||||
)
|
||||
from vllm.entrypoints.constants import MCP_PREFIX
|
||||
from vllm.entrypoints.openai.parser.harmony_utils import (
|
||||
get_encoding,
|
||||
get_streamable_parser_for_assistant,
|
||||
@ -303,7 +304,7 @@ class ParsableContext(ConversationContext):
|
||||
result_str = result.content[0].text
|
||||
|
||||
message = ResponseFunctionToolCallOutputItem(
|
||||
id=f"fco_{random_uuid()}",
|
||||
id=f"mcpo_{random_uuid()}",
|
||||
type="function_call_output",
|
||||
call_id=f"call_{random_uuid()}",
|
||||
output=result_str,
|
||||
@ -385,6 +386,9 @@ class ParsableContext(ConversationContext):
|
||||
if not self.parser.response_messages:
|
||||
return []
|
||||
last_msg = self.parser.response_messages[-1]
|
||||
# change this to a mcp_ function call
|
||||
last_msg.id = f"{MCP_PREFIX}{random_uuid()}"
|
||||
self.parser.response_messages[-1] = last_msg
|
||||
if last_msg.name == "code_interpreter":
|
||||
return await self.call_python_tool(self._tool_sessions["python"], last_msg)
|
||||
elif last_msg.name == "web_search_preview":
|
||||
|
||||
@ -1339,6 +1339,7 @@ class OpenAIServing:
|
||||
)
|
||||
engine_prompt = engine_prompts[0]
|
||||
request_prompt = request_prompts[0]
|
||||
prompt_text, _, _ = self._get_prompt_components(request_prompt)
|
||||
|
||||
# Update the sampling params.
|
||||
sampling_params.max_tokens = self.max_model_len - len(
|
||||
|
||||
@ -22,6 +22,7 @@ from openai.types.responses.response_reasoning_item import ResponseReasoningItem
|
||||
from openai.types.responses.tool import Tool
|
||||
|
||||
from vllm import envs
|
||||
from vllm.entrypoints.constants import MCP_PREFIX
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ChatCompletionMessageParam,
|
||||
ResponseInputOutputItem,
|
||||
@ -44,13 +45,13 @@ def make_response_output_items_from_parsable_context(
|
||||
)
|
||||
if isinstance(output_messages[-1], ResponseFunctionToolCall):
|
||||
mcp_message = McpCall(
|
||||
id=f"mcp_{random_uuid()}",
|
||||
id=f"{MCP_PREFIX}{random_uuid()}",
|
||||
arguments=output_messages[-1].arguments,
|
||||
name=output_messages[-1].name,
|
||||
server_label=output_messages[
|
||||
-1
|
||||
].name, # TODO: store the server label
|
||||
type="mcp_call",
|
||||
type=f"{MCP_PREFIX}call",
|
||||
status="completed",
|
||||
output=message.output,
|
||||
# TODO: support error output
|
||||
@ -98,12 +99,63 @@ def construct_input_messages(
|
||||
if isinstance(request_input, str):
|
||||
messages.append({"role": "user", "content": request_input})
|
||||
else:
|
||||
for item in request_input:
|
||||
messages.append(construct_chat_message_with_tool_call(item))
|
||||
input_messages = construct_chat_messages_with_tool_call(request_input)
|
||||
messages.extend(input_messages)
|
||||
return messages
|
||||
|
||||
|
||||
def construct_chat_message_with_tool_call(
|
||||
def _maybe_combine_reasoning_and_tool_call(
|
||||
item: ResponseInputOutputItem, messages: list[ChatCompletionMessageParam]
|
||||
) -> ChatCompletionMessageParam | None:
|
||||
"""Many models treat MCP calls and reasoning as a single message.
|
||||
This function checks if the last message is a reasoning message and
|
||||
the current message is a tool call"""
|
||||
if not (
|
||||
isinstance(item, ResponseFunctionToolCall) and item.id.startswith(MCP_PREFIX)
|
||||
):
|
||||
return None
|
||||
if len(messages) == 0:
|
||||
return None
|
||||
last_message = messages[-1]
|
||||
if not (
|
||||
last_message.get("role") == "assistant"
|
||||
and last_message.get("reasoning") is not None
|
||||
):
|
||||
return None
|
||||
|
||||
last_message["tool_calls"] = [
|
||||
ChatCompletionMessageToolCallParam(
|
||||
id=item.call_id,
|
||||
function=FunctionCallTool(
|
||||
name=item.name,
|
||||
arguments=item.arguments,
|
||||
),
|
||||
type="function",
|
||||
)
|
||||
]
|
||||
return last_message
|
||||
|
||||
|
||||
def construct_chat_messages_with_tool_call(
|
||||
input_messages: list[ResponseInputOutputItem],
|
||||
) -> list[ChatCompletionMessageParam]:
|
||||
"""This function wraps _construct_single_message_from_response_item
|
||||
Because some chatMessages come from multiple response items
|
||||
for example a reasoning item and a MCP tool call are two response items
|
||||
but are one chat message
|
||||
"""
|
||||
messages: list[ChatCompletionMessageParam] = []
|
||||
for item in input_messages:
|
||||
maybe_combined_message = _maybe_combine_reasoning_and_tool_call(item, messages)
|
||||
if maybe_combined_message is not None:
|
||||
messages[-1] = maybe_combined_message
|
||||
else:
|
||||
messages.append(_construct_single_message_from_response_item(item))
|
||||
|
||||
return messages
|
||||
|
||||
|
||||
def _construct_single_message_from_response_item(
|
||||
item: ResponseInputOutputItem,
|
||||
) -> ChatCompletionMessageParam:
|
||||
if isinstance(item, ResponseFunctionToolCall):
|
||||
|
||||
@ -881,7 +881,7 @@ class FusedMoE(CustomOp):
|
||||
# Record that the clone will be used by shared_experts_stream
|
||||
# to avoid gc issue from deallocation of hidden_states_clone
|
||||
# For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
|
||||
# NOTE: We dont need shared_output.record_stream(current_stream())
|
||||
# NOTE: We don't need shared_output.record_stream(current_stream())
|
||||
# because we synch the streams before using shared_output.
|
||||
hidden_states_clone.record_stream(self.shared_experts_stream)
|
||||
|
||||
|
||||
@ -28,7 +28,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
|
||||
|
||||
@classmethod
|
||||
def get_min_capability(cls) -> int:
|
||||
# dont restrict as emulations
|
||||
# don't restrict as emulations
|
||||
return 80
|
||||
|
||||
def create_weights(
|
||||
|
||||
@ -31,7 +31,6 @@ from vllm.model_executor.utils import replace_parameter
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.triton_utils import tl, triton
|
||||
from vllm.utils.deep_gemm import (
|
||||
DeepGemmQuantScaleFMT,
|
||||
fp8_gemm_nt,
|
||||
is_deep_gemm_e8m0_used,
|
||||
is_deep_gemm_supported,
|
||||
@ -248,6 +247,7 @@ class W8A8BlockFp8LinearOp:
|
||||
self.act_quant_group_shape = act_quant_group_shape
|
||||
self.is_deep_gemm_supported = is_deep_gemm_supported()
|
||||
self.is_hopper = current_platform.is_device_capability(90)
|
||||
self.is_blackwell = current_platform.is_device_capability(100)
|
||||
self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
|
||||
|
||||
# Get the correct blockscale mul and input quant operations.
|
||||
@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp:
|
||||
weight: torch.Tensor,
|
||||
weight_scale: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0:
|
||||
if self.use_deep_gemm_e8m0 and self.is_blackwell:
|
||||
q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
|
||||
input_2d,
|
||||
group_size=self.act_quant_group_shape.col,
|
||||
|
||||
@ -502,6 +502,7 @@ class HunYuanVisionTransformer(nn.Module):
|
||||
cu_seqlens: list = [0]
|
||||
|
||||
hidden_states = x.to(device=self.device, dtype=self.dtype)
|
||||
# embeddings = patch_embeds + patch_pos_embed
|
||||
hidden_states = self.embeddings(hidden_states, grid_thw)
|
||||
|
||||
for t, h, w in grid_thw:
|
||||
@ -515,8 +516,14 @@ class HunYuanVisionTransformer(nn.Module):
|
||||
|
||||
hidden_states = hidden_states.reshape(seq_len, -1)
|
||||
hidden_states = hidden_states.unsqueeze(0)
|
||||
for layer_num, layer in enumerate(self.layers):
|
||||
hidden_states = layer(hidden_states)
|
||||
|
||||
# build per-image lengths once
|
||||
split_lengths = [int(h) * int(w) for (_, h, w) in grid_thw]
|
||||
for layer in self.layers:
|
||||
# hidden_states: (1, T_total, D)
|
||||
parts = hidden_states.split(split_lengths, dim=1) # list of (1, L_i, D)
|
||||
parts = [layer(p) for p in parts]
|
||||
hidden_states = torch.cat(parts, dim=1)
|
||||
|
||||
# adapter
|
||||
split_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
||||
|
||||
@ -201,7 +201,7 @@ class MiniMaxM2Attention(nn.Module):
|
||||
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=rotary_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
rope_parameters=rope_parameters,
|
||||
)
|
||||
|
||||
@ -403,6 +403,7 @@ class Qwen3MoeModel(nn.Module):
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.vocab_size = config.vocab_size
|
||||
self.config = config
|
||||
self.quant_config = quant_config
|
||||
self.embed_tokens = VocabParallelEmbedding(
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
@ -505,6 +506,19 @@ class Qwen3MoeModel(nn.Module):
|
||||
loaded_params: set[str] = set()
|
||||
expert_params_mapping = self.get_expert_mapping()
|
||||
for name, loaded_weight in weights:
|
||||
if self.quant_config is not None and (
|
||||
scale_name := self.quant_config.get_cache_scale(name)
|
||||
):
|
||||
# Loading kv cache quantization scales
|
||||
param = params_dict[scale_name]
|
||||
weight_loader = getattr(param, "weight_loader", default_weight_loader)
|
||||
assert loaded_weight.numel() == 1, (
|
||||
f"KV scale numel {loaded_weight.numel()} != 1"
|
||||
)
|
||||
loaded_weight = loaded_weight.squeeze()
|
||||
weight_loader(param, loaded_weight)
|
||||
loaded_params.add(scale_name)
|
||||
continue
|
||||
for param_name, weight_name, shard_id in stacked_params_mapping:
|
||||
# Skip non-stacked layers and experts (experts handled below).
|
||||
if weight_name not in name:
|
||||
|
||||
@ -89,7 +89,7 @@ def _extract_data_from_linear_base_module(
|
||||
assert m.quant_method.quant_config is not None
|
||||
|
||||
w = m.weight
|
||||
ws = m.weight_scale
|
||||
ws = m.weight_scale_inv if hasattr(m, "weight_scale_inv") else m.weight_scale
|
||||
quant_block_size = m.quant_method.quant_config.weight_block_size
|
||||
|
||||
assert isinstance(w, torch.Tensor)
|
||||
|
||||
@ -954,7 +954,7 @@ MultiModalKwargsOptionalItems: TypeAlias = (
|
||||
)
|
||||
|
||||
|
||||
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.")
|
||||
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.14.")
|
||||
class MultiModalKwargs(UserDict[str, NestedTensors]):
|
||||
"""
|
||||
A dictionary that represents the keyword arguments to
|
||||
@ -964,7 +964,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
|
||||
@staticmethod
|
||||
@deprecated(
|
||||
"`MultiModalKwargs.from_hf_inputs` is deprecated and "
|
||||
"will be removed in v0.13. "
|
||||
"will be removed in v0.14. "
|
||||
"Please use `MultiModalKwargsItems.from_hf_inputs` and "
|
||||
"access the tensor data using `.get_data()`."
|
||||
)
|
||||
@ -977,7 +977,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
|
||||
@staticmethod
|
||||
@deprecated(
|
||||
"`MultiModalKwargs.from_items` is deprecated and "
|
||||
"will be removed in v0.13. "
|
||||
"will be removed in v0.14. "
|
||||
"Please use `MultiModalKwargsItems.from_seq` and "
|
||||
"access the tensor data using `.get_data()`."
|
||||
)
|
||||
|
||||
@ -429,12 +429,12 @@ def group_mm_kwargs_by_modality(
|
||||
if merge_by_field_config is not None:
|
||||
logger.warning_once(
|
||||
"The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
|
||||
"is deprecated and will be removed in v0.13."
|
||||
"is deprecated and will be removed in v0.14."
|
||||
)
|
||||
if multimodal_cpu_fields is not None:
|
||||
logger.warning_once(
|
||||
"The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` "
|
||||
"is deprecated and will be removed in v0.13."
|
||||
"is deprecated and will be removed in v0.14."
|
||||
)
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalKwargsItems
|
||||
|
||||
@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer):
|
||||
self.name_or_path = (
|
||||
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
|
||||
)
|
||||
self._added_vocab = self.tokenizer.get_added_vocab()
|
||||
self._added_vocab_size = len(self._added_vocab)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(
|
||||
@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
|
||||
|
||||
def __len__(self) -> int:
|
||||
# </think> is an added token in DeepseekV32 tokenizer
|
||||
return self.vocab_size + len(self.get_added_vocab())
|
||||
return self.vocab_size + self._added_vocab_size
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
|
||||
return self.tokenizer.get_vocab()
|
||||
|
||||
def get_added_vocab(self) -> dict[str, int]:
|
||||
return self.tokenizer.get_added_vocab()
|
||||
return self._added_vocab.copy()
|
||||
|
||||
def encode(
|
||||
self,
|
||||
|
||||
@ -17,7 +17,7 @@ def __getattr__(name: str):
|
||||
warnings.warn(
|
||||
"`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
|
||||
"`vllm.tokenizers.TokenizerLike`. "
|
||||
"The old name will be removed in v0.13.",
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
@ -29,7 +29,7 @@ def __getattr__(name: str):
|
||||
warnings.warn(
|
||||
"`vllm.transformers_utils.tokenizer.get_tokenizer` "
|
||||
"has been moved to `vllm.tokenizers.get_tokenizer`. "
|
||||
"The old name will be removed in v0.13.",
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
@ -41,7 +41,7 @@ def __getattr__(name: str):
|
||||
warnings.warn(
|
||||
"`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
|
||||
"has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
|
||||
"The old name will be removed in v0.13.",
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
@ -53,7 +53,7 @@ def __getattr__(name: str):
|
||||
warnings.warn(
|
||||
"`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
|
||||
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
|
||||
"The old name will be removed in v0.13.",
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
@ -65,7 +65,7 @@ def __getattr__(name: str):
|
||||
warnings.warn(
|
||||
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
|
||||
"has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
|
||||
"The old name will be removed in v0.13.",
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
@ -75,7 +75,7 @@ def __getattr__(name: str):
|
||||
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||
|
||||
|
||||
@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.")
|
||||
@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.")
|
||||
def decode_tokens(
|
||||
tokenizer: TokenizerLike,
|
||||
token_ids: list[int],
|
||||
@ -97,7 +97,7 @@ def decode_tokens(
|
||||
return tokenizer.decode(token_ids, **kw_args)
|
||||
|
||||
|
||||
@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.")
|
||||
@deprecated("Will be removed in v0.14. Please use `tokenizer.encode()` instead.")
|
||||
def encode_tokens(
|
||||
tokenizer: TokenizerLike,
|
||||
text: str,
|
||||
|
||||
@ -11,7 +11,7 @@ def __getattr__(name: str):
|
||||
warnings.warn(
|
||||
"`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been "
|
||||
"moved to `vllm.tokenizers.TokenizerLike`. "
|
||||
"The old name will be removed in v0.13.",
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
@ -23,7 +23,7 @@ def __getattr__(name: str):
|
||||
warnings.warn(
|
||||
"`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been "
|
||||
"moved to `vllm.tokenizers.TokenizerRegistry`. "
|
||||
"The old name will be removed in v0.13.",
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@ -21,7 +21,7 @@ from vllm.v1.attention.backends.utils import (
|
||||
CommonAttentionMetadata,
|
||||
split_decodes_and_prefills,
|
||||
)
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
||||
from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -50,11 +50,13 @@ class CPUAttentionBackend(AttentionBackend):
|
||||
|
||||
@classmethod
|
||||
def supports_attn_type(cls, attn_type: str) -> bool:
|
||||
"""CPU attention supports decoder and encoder-only attention."""
|
||||
"""CPU attention supports decoder,
|
||||
encoder-only and encoder-decoder attention."""
|
||||
return attn_type in (
|
||||
AttentionType.DECODER,
|
||||
AttentionType.ENCODER,
|
||||
AttentionType.ENCODER_ONLY,
|
||||
AttentionType.ENCODER_DECODER,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
@ -136,6 +138,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
|
||||
self.window_size = -1
|
||||
self.block_size = vllm_config.cache_config.block_size
|
||||
self.isa = _get_attn_isa(self.dtype, self.block_size)
|
||||
self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec)
|
||||
|
||||
def build(
|
||||
self,
|
||||
@ -151,7 +154,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
|
||||
seq_lens = common_attn_metadata.seq_lens
|
||||
block_table_tensor = common_attn_metadata.block_table_tensor
|
||||
slot_mapping = common_attn_metadata.slot_mapping
|
||||
causal = common_attn_metadata.causal
|
||||
causal = False if self.is_cross_attention else common_attn_metadata.causal
|
||||
|
||||
sdpa_start_loc = query_start_loc
|
||||
num_decode_tokens = 0
|
||||
@ -171,22 +174,19 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
|
||||
query_start_loc = query_start_loc[: num_decodes + 1]
|
||||
block_table_tensor = block_table_tensor[:num_decodes]
|
||||
|
||||
sheduler_metadata = None
|
||||
if causal:
|
||||
# for decode batch, use the custom kernel
|
||||
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
|
||||
num_reqs=num_reqs,
|
||||
num_heads=self.num_heads,
|
||||
num_kv_heads=self.num_kv_heads,
|
||||
head_dim=self.head_dim,
|
||||
seq_lens=seq_lens,
|
||||
dtype=self.dtype,
|
||||
query_start_loc=query_start_loc,
|
||||
causal=causal,
|
||||
sliding_window_size=self.window_size,
|
||||
isa=self.isa,
|
||||
enable_kv_split=True,
|
||||
)
|
||||
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
|
||||
num_reqs=num_reqs,
|
||||
num_heads=self.num_heads,
|
||||
num_kv_heads=self.num_kv_heads,
|
||||
head_dim=self.head_dim,
|
||||
seq_lens=seq_lens,
|
||||
dtype=self.dtype,
|
||||
query_start_loc=query_start_loc,
|
||||
causal=causal,
|
||||
sliding_window_size=self.window_size,
|
||||
isa=self.isa,
|
||||
enable_kv_split=True,
|
||||
)
|
||||
|
||||
attn_metadata = CPUAttentionMetadata(
|
||||
isa=self.isa,
|
||||
|
||||
@ -370,6 +370,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
|
||||
|
||||
num_accepted_tokens = torch.diff(m.query_start_loc)
|
||||
num_decode_draft_tokens_cpu = (num_accepted_tokens - 1).cpu()
|
||||
m.num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
|
||||
m._num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
|
||||
|
||||
return self.build(0, m, num_accepted_tokens, num_decode_draft_tokens_cpu)
|
||||
|
||||
@ -18,7 +18,7 @@ from typing import (
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from typing_extensions import runtime_checkable
|
||||
from typing_extensions import deprecated, runtime_checkable
|
||||
|
||||
from vllm.config import VllmConfig, get_layers_from_vllm_config
|
||||
from vllm.utils.math_utils import cdiv
|
||||
@ -66,11 +66,6 @@ class CommonAttentionMetadata:
|
||||
"""(batch_size + 1,), the start location of each request in query Tensor"""
|
||||
|
||||
seq_lens: torch.Tensor
|
||||
seq_lens_cpu: torch.Tensor
|
||||
"""(batch_size,), the length of each request including both computed tokens
|
||||
and newly scheduled tokens"""
|
||||
|
||||
num_computed_tokens_cpu: torch.Tensor
|
||||
"""(batch_size,), the number of computed tokens for each request"""
|
||||
|
||||
num_reqs: int
|
||||
@ -81,7 +76,7 @@ class CommonAttentionMetadata:
|
||||
max_query_len: int
|
||||
"""Longest query in batch"""
|
||||
max_seq_len: int
|
||||
"""Longest context length in batch"""
|
||||
"""Longest context length (may be an upper bound)"""
|
||||
|
||||
block_table_tensor: torch.Tensor
|
||||
slot_mapping: torch.Tensor
|
||||
@ -100,6 +95,40 @@ class CommonAttentionMetadata:
|
||||
dcp_local_seq_lens_cpu: torch.Tensor | None = None
|
||||
"""Sequence lengths of the local rank in decode context parallelism world"""
|
||||
|
||||
# WARNING: Deprecated fields. Will be removed in a future release (v0.14.0)
|
||||
_seq_lens_cpu: torch.Tensor | None = None
|
||||
_num_computed_tokens_cpu: torch.Tensor | None = None
|
||||
|
||||
@property
|
||||
@deprecated(
|
||||
"""
|
||||
Prefer using device seq_lens directly to avoid implicit H<>D sync.
|
||||
If a CPU copy is needed, use `seq_lens.cpu()` instead.
|
||||
Will be removed in a future release (v0.14.0)
|
||||
"""
|
||||
)
|
||||
def seq_lens_cpu(self) -> torch.Tensor:
|
||||
if self._seq_lens_cpu is None:
|
||||
self._seq_lens_cpu = self.seq_lens.to("cpu")
|
||||
return self._seq_lens_cpu
|
||||
|
||||
@property
|
||||
@deprecated(
|
||||
"""
|
||||
Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full
|
||||
async scheduling. If a CPU copy is needed, it can be derived from
|
||||
query_start_loc_cpu and seq_lens.
|
||||
Will be removed in a future release (v0.14.0)
|
||||
"""
|
||||
)
|
||||
def num_computed_tokens_cpu(self) -> torch.Tensor:
|
||||
if self._num_computed_tokens_cpu is None:
|
||||
query_seq_lens = (
|
||||
self.query_start_loc_cpu[1:] - self.query_start_loc_cpu[:-1]
|
||||
)
|
||||
self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens
|
||||
return self._num_computed_tokens_cpu
|
||||
|
||||
# TODO(lucas): remove once we have FULL-CG spec-decode support
|
||||
def unpadded(
|
||||
self, num_actual_tokens: int, num_actual_reqs: int
|
||||
@ -109,8 +138,12 @@ class CommonAttentionMetadata:
|
||||
query_start_loc=self.query_start_loc[: num_actual_reqs + 1],
|
||||
query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1],
|
||||
seq_lens=self.seq_lens[:num_actual_reqs],
|
||||
seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs],
|
||||
num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs],
|
||||
_seq_lens_cpu=self._seq_lens_cpu[:num_actual_reqs]
|
||||
if self._seq_lens_cpu is not None
|
||||
else None,
|
||||
_num_computed_tokens_cpu=self._num_computed_tokens_cpu[:num_actual_reqs]
|
||||
if self._num_computed_tokens_cpu is not None
|
||||
else None,
|
||||
num_reqs=num_actual_reqs,
|
||||
num_actual_tokens=num_actual_tokens,
|
||||
max_query_len=self.max_query_len,
|
||||
@ -224,14 +257,14 @@ def _make_metadata_with_slice(
|
||||
query_start_loc=query_start_loc,
|
||||
query_start_loc_cpu=query_start_loc_cpu,
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens_cpu,
|
||||
num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
num_reqs=num_requests,
|
||||
num_actual_tokens=num_actual_tokens,
|
||||
max_query_len=max_query_len,
|
||||
max_seq_len=max_seq_len,
|
||||
block_table_tensor=block_table_tensor,
|
||||
slot_mapping=slot_mapping,
|
||||
_seq_lens_cpu=seq_lens_cpu,
|
||||
_num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
)
|
||||
|
||||
|
||||
@ -689,9 +722,7 @@ def make_local_attention_virtual_batches(
|
||||
return CommonAttentionMetadata(
|
||||
query_start_loc_cpu=query_start_loc_cpu,
|
||||
query_start_loc=query_start_loc_cpu.to(device=device, non_blocking=True),
|
||||
seq_lens_cpu=seq_lens_cpu,
|
||||
seq_lens=seq_lens_cpu.to(device=device, non_blocking=True),
|
||||
num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
|
||||
num_reqs=len(seq_lens_cpu),
|
||||
num_actual_tokens=common_attn_metadata.num_actual_tokens,
|
||||
max_query_len=seqlens_q_local.max(),
|
||||
@ -699,6 +730,8 @@ def make_local_attention_virtual_batches(
|
||||
block_table_tensor=block_table_local,
|
||||
slot_mapping=common_attn_metadata.slot_mapping,
|
||||
causal=True,
|
||||
_seq_lens_cpu=seq_lens_cpu,
|
||||
_num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
|
||||
)
|
||||
|
||||
|
||||
@ -719,7 +752,6 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
|
||||
logits_indices = logits_indices_padded[:num_logits_indices]
|
||||
num_reqs = common_attn_metadata.num_reqs
|
||||
query_start_loc = common_attn_metadata.query_start_loc
|
||||
seq_lens = common_attn_metadata.seq_lens
|
||||
# Example inputs
|
||||
# num_reqs: 3
|
||||
# generation_indices: [14, 18, 19, 27]
|
||||
@ -748,9 +780,7 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
|
||||
common_attn_metadata = CommonAttentionMetadata(
|
||||
query_start_loc=decode_query_start_loc,
|
||||
query_start_loc_cpu=decode_query_start_loc.to("cpu", non_blocking=True),
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens.to("cpu", non_blocking=True),
|
||||
num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
|
||||
seq_lens=common_attn_metadata.seq_lens,
|
||||
num_reqs=num_reqs,
|
||||
num_actual_tokens=total_num_decode_tokens,
|
||||
max_query_len=decode_max_query_len,
|
||||
@ -758,6 +788,8 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
|
||||
block_table_tensor=common_attn_metadata.block_table_tensor,
|
||||
slot_mapping=common_attn_metadata.slot_mapping,
|
||||
causal=True,
|
||||
_seq_lens_cpu=common_attn_metadata._seq_lens_cpu,
|
||||
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
|
||||
)
|
||||
return common_attn_metadata
|
||||
|
||||
|
||||
@ -192,7 +192,7 @@ class AsyncLLM(EngineClient):
|
||||
@property
|
||||
@deprecated(
|
||||
"`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. "
|
||||
"The old name will be removed in v0.13."
|
||||
"The old name will be removed in v0.14."
|
||||
)
|
||||
def processor(self):
|
||||
return self.input_processor
|
||||
|
||||
@ -139,7 +139,7 @@ class LLMEngine:
|
||||
@property
|
||||
@deprecated(
|
||||
"`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
|
||||
"The old name will be removed in v0.13."
|
||||
"The old name will be removed in v0.14."
|
||||
)
|
||||
def processor(self):
|
||||
return self.input_processor
|
||||
|
||||
@ -10,7 +10,7 @@ def __getattr__(name: str):
|
||||
warnings.warn(
|
||||
"`vllm.v1.engine.processor.Processor` has been moved to "
|
||||
"`vllm.v1.engine.input_processor.InputProcessor`. "
|
||||
"The old name will be removed in v0.13.",
|
||||
"The old name will be removed in v0.14.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
@ -440,16 +440,16 @@ class EagleProposer:
|
||||
# of main model.
|
||||
# Increment the sequence lengths.
|
||||
common_attn_metadata.seq_lens += 1
|
||||
# This is an out-of-place operation to avoid modifying the original tensor.
|
||||
common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu + 1
|
||||
# For the requests that exceed the max model length, we set the
|
||||
# sequence length to 1 to minimize their overheads in attention.
|
||||
|
||||
common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
|
||||
|
||||
common_attn_metadata.num_computed_tokens_cpu = (
|
||||
common_attn_metadata.seq_lens_cpu - 1
|
||||
)
|
||||
# Also update the CPU-side shadow; NOTE: this is hacky and should be
|
||||
# removed in when common_attn_metadata.seq_lens_cpu is deprecated.
|
||||
if common_attn_metadata._seq_lens_cpu is not None:
|
||||
common_attn_metadata._seq_lens_cpu += 1
|
||||
if common_attn_metadata._num_computed_tokens_cpu is not None:
|
||||
common_attn_metadata._num_computed_tokens_cpu += 1
|
||||
|
||||
# Compute the slot mapping.
|
||||
if self.uses_mrope:
|
||||
@ -656,8 +656,8 @@ class EagleProposer:
|
||||
query_start_loc=common_attn_metadata.query_start_loc,
|
||||
seq_lens=common_attn_metadata.seq_lens,
|
||||
query_start_loc_cpu=query_start_loc_cpu,
|
||||
seq_lens_cpu=common_attn_metadata.seq_lens_cpu,
|
||||
num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
|
||||
_seq_lens_cpu=common_attn_metadata._seq_lens_cpu,
|
||||
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
|
||||
num_reqs=common_attn_metadata.num_reqs,
|
||||
num_actual_tokens=total_num_tokens,
|
||||
max_query_len=new_query_len_per_req.max().item(),
|
||||
@ -932,8 +932,8 @@ class EagleProposer:
|
||||
query_start_loc=new_query_start_loc_cpu.to(device, non_blocking=True),
|
||||
seq_lens=new_seq_lens_cpu.to(device, non_blocking=True),
|
||||
query_start_loc_cpu=new_query_start_loc_cpu,
|
||||
seq_lens_cpu=new_seq_lens_cpu,
|
||||
num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
|
||||
_seq_lens_cpu=new_seq_lens_cpu,
|
||||
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
|
||||
num_reqs=common_attn_metadata.num_reqs,
|
||||
num_actual_tokens=total_num_tokens,
|
||||
max_query_len=new_query_len_per_req.max().item(),
|
||||
|
||||
@ -38,16 +38,16 @@ class MedusaProposer:
|
||||
self,
|
||||
target_hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
) -> list[list[int]]:
|
||||
) -> torch.Tensor:
|
||||
# Generate blocks and compute logits
|
||||
blocks = self.model(target_hidden_states)
|
||||
logits = self.model.compute_logits(blocks)
|
||||
|
||||
# Get draft tokens and transpose the result
|
||||
# TODO(woosuk): OPTIMIZATION: Return GPU tensor without GPU-CPU
|
||||
# synchronization.
|
||||
draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits]
|
||||
return [list(row) for row in zip(*draft_tokens)]
|
||||
# Compute argmax for each Medusa head and stack into a single tensor
|
||||
# Shape: [batch_size, num_heads]
|
||||
draft_tokens = torch.stack([logit.argmax(dim=-1) for logit in logits], dim=1)
|
||||
|
||||
return draft_tokens
|
||||
|
||||
def load_model(self, target_model: nn.Module) -> None:
|
||||
from vllm.compilation.backends import set_model_tag
|
||||
|
||||
@ -10,7 +10,7 @@ import torch
|
||||
import vllm.envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.tokenizers import MistralTokenizer
|
||||
from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
from vllm.v1.structured_output.backend_types import (
|
||||
StructuredOutputBackend,
|
||||
@ -56,6 +56,27 @@ class XgrammarBackend(StructuredOutputBackend):
|
||||
stop_token_ids=stop_token_ids,
|
||||
add_prefix_space=True,
|
||||
)
|
||||
elif isinstance(self.tokenizer, DeepseekV32Tokenizer):
|
||||
# copy from xgr.TokenizerInfo.from_huggingface()
|
||||
# because we are using a custom tokenizer wrapper here.
|
||||
vocab_dict = self.tokenizer.get_vocab()
|
||||
tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1)
|
||||
vocab_size = self.vocab_size or tokenizer_vocab_size
|
||||
# maintain tokenizer's indexing
|
||||
encoded_vocab = [""] * vocab_size
|
||||
for token, idx in vocab_dict.items():
|
||||
if idx < vocab_size:
|
||||
encoded_vocab[idx] = token
|
||||
stop_token_ids = [self.tokenizer.eos_token_id]
|
||||
backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str()
|
||||
metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
|
||||
tokenizer_info = xgr.TokenizerInfo(
|
||||
encoded_vocab=encoded_vocab,
|
||||
vocab_type=metadata["vocab_type"],
|
||||
vocab_size=vocab_size,
|
||||
stop_token_ids=stop_token_ids,
|
||||
add_prefix_space=metadata["add_prefix_space"],
|
||||
)
|
||||
else:
|
||||
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
|
||||
self.tokenizer,
|
||||
|
||||
@ -168,9 +168,9 @@ def build_attn_metadata(
|
||||
query_start_loc=query_start_loc_gpu,
|
||||
query_start_loc_cpu=query_start_loc_cpu,
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens_cpu,
|
||||
_seq_lens_cpu=seq_lens_cpu,
|
||||
max_seq_len=max_seq_len,
|
||||
num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
_num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
num_reqs=num_reqs,
|
||||
num_actual_tokens=num_tokens,
|
||||
max_query_len=max_query_len,
|
||||
|
||||
@ -1267,6 +1267,8 @@ class GPUModelRunner(
|
||||
if not isinstance(kv_cache_spec, CrossAttentionSpec):
|
||||
return None, None
|
||||
|
||||
# Zero out buffer for padding requests that are not actually scheduled (CGs)
|
||||
self.encoder_seq_lens.np[:num_reqs] = 0
|
||||
# Build encoder_seq_lens array mapping request indices to
|
||||
# encoder lengths for inputs scheduled in this batch
|
||||
for req_id in num_scheduled_tokens:
|
||||
@ -1626,8 +1628,8 @@ class GPUModelRunner(
|
||||
query_start_loc=query_start_loc,
|
||||
query_start_loc_cpu=query_start_loc_cpu,
|
||||
seq_lens=seq_lens,
|
||||
seq_lens_cpu=seq_lens_cpu,
|
||||
num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
_seq_lens_cpu=seq_lens_cpu,
|
||||
_num_computed_tokens_cpu=num_computed_tokens_cpu,
|
||||
num_actual_tokens=num_tokens_padded,
|
||||
num_reqs=num_reqs_padded,
|
||||
max_query_len=max_query_len,
|
||||
@ -2764,6 +2766,7 @@ class GPUModelRunner(
|
||||
# be improved in model runner v2)
|
||||
force_uniform_decode: bool | None = None,
|
||||
force_has_lora: bool | None = None,
|
||||
num_encoder_reqs: int = 0,
|
||||
) -> tuple[
|
||||
CUDAGraphMode,
|
||||
BatchDescriptor,
|
||||
@ -2780,6 +2783,11 @@ class GPUModelRunner(
|
||||
if force_uniform_decode is None
|
||||
else force_uniform_decode
|
||||
)
|
||||
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
|
||||
# is present). Also, chunked-prefill is disabled, so batch are uniform.
|
||||
has_encoder_output = (
|
||||
self.model_config.is_encoder_decoder and num_encoder_reqs > 0
|
||||
)
|
||||
|
||||
has_lora = (
|
||||
len(self.input_batch.lora_id_to_lora_request) > 0
|
||||
@ -2799,7 +2807,7 @@ class GPUModelRunner(
|
||||
)
|
||||
|
||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
|
||||
num_tokens_padded, use_cascade_attn
|
||||
num_tokens_padded, use_cascade_attn or has_encoder_output
|
||||
)
|
||||
num_tokens_padded = batch_descriptor.num_tokens
|
||||
|
||||
@ -2997,6 +3005,7 @@ class GPUModelRunner(
|
||||
num_scheduled_tokens_np=num_scheduled_tokens_np,
|
||||
max_num_scheduled_tokens=max_num_scheduled_tokens,
|
||||
use_cascade_attn=cascade_attn_prefix_lens is not None,
|
||||
num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
|
||||
)
|
||||
|
||||
logger.debug(
|
||||
@ -4871,7 +4880,7 @@ class GPUModelRunner(
|
||||
# we need to adjust the cudagraph sizes to be a multiple of the uniform
|
||||
# decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
|
||||
# temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
|
||||
# Will be removed in the near future when we have seperate cudagraph capture
|
||||
# Will be removed in the near future when we have separate cudagraph capture
|
||||
# sizes for decode and mixed prefill-decode.
|
||||
if (
|
||||
cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
|
||||
|
||||
@ -10,7 +10,7 @@ import torch
|
||||
import torch.nn as nn
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.distributed import (
|
||||
ensure_model_parallel_initialized,
|
||||
init_distributed_environment,
|
||||
@ -207,7 +207,8 @@ class TPUWorker:
|
||||
# one compiled bytecode. Having one FX graph/cached bytecode per
|
||||
# compiled model is required for `support_torch_compile` decorator to
|
||||
# skip dynamo guard.
|
||||
self.model_runner.reset_dynamo_cache()
|
||||
with set_current_vllm_config(self.vllm_config):
|
||||
self.model_runner.reset_dynamo_cache()
|
||||
|
||||
# Get the maximum amount of memory used by the model weights and
|
||||
# intermediate activations.
|
||||
|
||||
@ -135,7 +135,7 @@ class AttentionGroup:
|
||||
kv_cache_spec: KVCacheSpec
|
||||
kv_cache_group_id: int
|
||||
# When ubatching is enabled we will have a metadata builder for each ubatch
|
||||
# so that if they use internal persistant buffers for cudagraphs, and they
|
||||
# so that if they use internal persistent buffers for cudagraphs, and they
|
||||
# won't have to worry about conflicting with the other ubatches.
|
||||
metadata_builders: list[AttentionMetadataBuilder] = field(
|
||||
default_factory=lambda: []
|
||||
@ -313,8 +313,12 @@ def bind_kv_cache(
|
||||
# TODO - analyze where runner_kv_caches is used and the right
|
||||
# way to ensure it properly reflects multiple attention layers
|
||||
# in the same decoder block.
|
||||
if current_platform.is_cuda_alike() or current_platform.is_xpu():
|
||||
# We know that the GPU runner is not impacted by this
|
||||
if (
|
||||
current_platform.is_cuda_alike()
|
||||
or current_platform.is_xpu()
|
||||
or current_platform.is_cpu()
|
||||
):
|
||||
# We know that the GPU / CPU runner is not impacted by this
|
||||
# case. Some test code depends on runner_kv_caches, but
|
||||
# not in a way that's impacted by ignoring this.
|
||||
pass
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user