mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-29 06:17:05 +08:00
Merge branch 'main' into mlm-full-lora-support
This commit is contained in:
commit
d1307e1d29
@ -36,6 +36,11 @@ function cpu_tests() {
|
|||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
|
# Run model tests
|
||||||
|
docker exec cpu-test bash -c "
|
||||||
|
set -e
|
||||||
|
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
|
||||||
|
|
||||||
# Run kernel tests
|
# Run kernel tests
|
||||||
docker exec cpu-test bash -c "
|
docker exec cpu-test bash -c "
|
||||||
set -e
|
set -e
|
||||||
|
|||||||
@ -1,73 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euxo pipefail
|
|
||||||
|
|
||||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
|
||||||
THRESHOLD=${1:-0.25}
|
|
||||||
NUM_Q=${2:-1319}
|
|
||||||
PORT=${3:-8030}
|
|
||||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
|
||||||
mkdir -p "${OUT_DIR}"
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
local port=$1
|
|
||||||
timeout 600 bash -c '
|
|
||||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done'
|
|
||||||
}
|
|
||||||
|
|
||||||
MODEL="deepseek-ai/DeepSeek-V2-lite"
|
|
||||||
|
|
||||||
# Set BACKENDS based on platform
|
|
||||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
|
||||||
# ROCm platform
|
|
||||||
BACKENDS=("allgather_reducescatter")
|
|
||||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
|
||||||
export VLLM_ROCM_MOE_PADDING=0
|
|
||||||
else
|
|
||||||
# Non-ROCm platform (CUDA/other)
|
|
||||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
|
||||||
fi
|
|
||||||
|
|
||||||
cleanup() {
|
|
||||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
|
||||||
kill "${SERVER_PID}" 2>/dev/null || true
|
|
||||||
for _ in {1..20}; do
|
|
||||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
|
||||||
sleep 0.5
|
|
||||||
done
|
|
||||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
for BACK in "${BACKENDS[@]}"; do
|
|
||||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
|
||||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
|
||||||
vllm serve "$MODEL" \
|
|
||||||
--enforce-eager \
|
|
||||||
--tensor-parallel-size 2 \
|
|
||||||
--data-parallel-size 2 \
|
|
||||||
--enable-expert-parallel \
|
|
||||||
--enable-eplb \
|
|
||||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
|
||||||
--trust-remote-code \
|
|
||||||
--max-model-len 2048 \
|
|
||||||
--port $PORT &
|
|
||||||
SERVER_PID=$!
|
|
||||||
wait_for_server $PORT
|
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
|
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
|
||||||
python3 - <<PY
|
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
|
||||||
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
|
|
||||||
PY
|
|
||||||
|
|
||||||
cleanup
|
|
||||||
SERVER_PID=
|
|
||||||
sleep 1
|
|
||||||
PORT=$((PORT+1))
|
|
||||||
done
|
|
||||||
@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
|
|||||||
--data-parallel-size 2 \
|
--data-parallel-size 2 \
|
||||||
--enable-expert-parallel \
|
--enable-expert-parallel \
|
||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
--eplb-config '{"window_size":200,"step_interval":600}' \
|
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--max-model-len 2048 \
|
--max-model-len 2048 \
|
||||||
--port $PORT &
|
--port $PORT &
|
||||||
|
|||||||
@ -1,74 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
set -euxo pipefail
|
|
||||||
|
|
||||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
|
||||||
THRESHOLD=${1:-0.25}
|
|
||||||
NUM_Q=${2:-1319}
|
|
||||||
PORT=${3:-8040}
|
|
||||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
|
||||||
mkdir -p "${OUT_DIR}"
|
|
||||||
|
|
||||||
wait_for_server() {
|
|
||||||
local port=$1
|
|
||||||
timeout 600 bash -c '
|
|
||||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
|
||||||
sleep 1
|
|
||||||
done'
|
|
||||||
}
|
|
||||||
|
|
||||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
|
||||||
|
|
||||||
# Set BACKENDS based on platform
|
|
||||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
|
||||||
# ROCm platform
|
|
||||||
BACKENDS=("allgather_reducescatter")
|
|
||||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
|
||||||
export VLLM_ROCM_MOE_PADDING=0
|
|
||||||
else
|
|
||||||
# Non-ROCm platform (CUDA/other)
|
|
||||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
|
||||||
fi
|
|
||||||
|
|
||||||
cleanup() {
|
|
||||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
|
||||||
kill "${SERVER_PID}" 2>/dev/null || true
|
|
||||||
for _ in {1..20}; do
|
|
||||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
|
||||||
sleep 0.5
|
|
||||||
done
|
|
||||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
trap cleanup EXIT
|
|
||||||
|
|
||||||
for BACK in "${BACKENDS[@]}"; do
|
|
||||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
|
||||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
|
||||||
vllm serve "$MODEL" \
|
|
||||||
--enforce-eager \
|
|
||||||
--tensor-parallel-size 4 \
|
|
||||||
--enable-expert-parallel \
|
|
||||||
--enable-eplb \
|
|
||||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
|
||||||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
|
||||||
--trust-remote-code \
|
|
||||||
--max-model-len 2048 \
|
|
||||||
--gpu-memory-utilization 0.9 \
|
|
||||||
--port $PORT &
|
|
||||||
SERVER_PID=$!
|
|
||||||
wait_for_server $PORT
|
|
||||||
|
|
||||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
|
||||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
|
||||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
|
||||||
python3 - <<PY
|
|
||||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
|
||||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
|
||||||
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
|
|
||||||
PY
|
|
||||||
|
|
||||||
cleanup
|
|
||||||
SERVER_PID=
|
|
||||||
sleep 1
|
|
||||||
PORT=$((PORT+1))
|
|
||||||
done
|
|
||||||
@ -1379,22 +1379,4 @@ steps:
|
|||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||||
|
|
||||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
|
||||||
|
|
||||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
|
||||||
timeout_in_minutes: 60
|
|
||||||
gpu: h100
|
|
||||||
optional: true
|
|
||||||
num_gpus: 4
|
|
||||||
working_dir: "/vllm-workspace"
|
|
||||||
commands:
|
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
|
||||||
@ -574,7 +574,7 @@ async def benchmark(
|
|||||||
)
|
)
|
||||||
print(
|
print(
|
||||||
"{:<40} {:<10.2f}".format(
|
"{:<40} {:<10.2f}".format(
|
||||||
"Total Token throughput (tok/s):", metrics.total_token_throughput
|
"Total token throughput (tok/s):", metrics.total_token_throughput
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
|
|||||||
input.casual = casual;
|
input.casual = casual;
|
||||||
input.isa = isa;
|
input.isa = isa;
|
||||||
input.enable_kv_split = enable_kv_split;
|
input.enable_kv_split = enable_kv_split;
|
||||||
TORCH_CHECK(casual, "Only supports casual mask for now.");
|
|
||||||
|
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
|
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
|
||||||
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
|
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
|
||||||
|
|||||||
@ -84,7 +84,7 @@ Total input tokens: 1369
|
|||||||
Total generated tokens: 2212
|
Total generated tokens: 2212
|
||||||
Request throughput (req/s): 1.73
|
Request throughput (req/s): 1.73
|
||||||
Output token throughput (tok/s): 382.89
|
Output token throughput (tok/s): 382.89
|
||||||
Total Token throughput (tok/s): 619.85
|
Total token throughput (tok/s): 619.85
|
||||||
---------------Time to First Token----------------
|
---------------Time to First Token----------------
|
||||||
Mean TTFT (ms): 71.54
|
Mean TTFT (ms): 71.54
|
||||||
Median TTFT (ms): 73.88
|
Median TTFT (ms): 73.88
|
||||||
|
|||||||
@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request
|
|||||||
|
|
||||||
### v1 Metrics
|
### v1 Metrics
|
||||||
|
|
||||||
In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
|
In v1, an extensive set of metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example:
|
||||||
|
|
||||||
- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
|
- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
|
||||||
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
|
|
||||||
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
|
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
|
||||||
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
|
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
|
||||||
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
|
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
|
||||||
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
|
|
||||||
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
|
|
||||||
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
|
|
||||||
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
|
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
|
||||||
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
|
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
|
||||||
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
|
|
||||||
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
|
|
||||||
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
|
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
|
||||||
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
|
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
|
||||||
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
|
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
|
||||||
- `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
|
|
||||||
- `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests.
|
|
||||||
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
|
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
|
||||||
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
|
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
|
||||||
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
|
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
|
||||||
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
|
|
||||||
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
|
|
||||||
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
|
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
|
||||||
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.
|
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.
|
||||||
|
|
||||||
|
|||||||
@ -61,7 +61,7 @@ Now let´s see an example for each of the cases, starting with the `choice`, as
|
|||||||
print(completion.choices[0].message.content)
|
print(completion.choices[0].message.content)
|
||||||
```
|
```
|
||||||
|
|
||||||
The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
|
The next example shows how to use the `regex`. The supported regex syntax depends on the structured output backend. For example, `xgrammar`, `guidance`, and `outlines` use Rust-style regex, while `lm-format-enforcer` uses Python's `re` module. The idea is to generate an email address, given a simple regex template:
|
||||||
|
|
||||||
??? code
|
??? code
|
||||||
|
|
||||||
|
|||||||
149
docs/mkdocs/hooks/generate_metrics.py
Normal file
149
docs/mkdocs/hooks/generate_metrics.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
import ast
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
logger = logging.getLogger("mkdocs")
|
||||||
|
|
||||||
|
ROOT_DIR = Path(__file__).parent.parent.parent.parent
|
||||||
|
DOCS_DIR = ROOT_DIR / "docs"
|
||||||
|
GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics"
|
||||||
|
|
||||||
|
# Files to scan for metric definitions - each will generate a separate table
|
||||||
|
METRIC_SOURCE_FILES = [
|
||||||
|
{"path": "vllm/v1/metrics/loggers.py", "output": "general.md"},
|
||||||
|
{
|
||||||
|
"path": "vllm/v1/spec_decode/metrics.py",
|
||||||
|
"output": "spec_decode.md",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
|
||||||
|
"output": "nixl_connector.md",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class MetricExtractor(ast.NodeVisitor):
|
||||||
|
"""AST visitor to extract metric definitions."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.metrics: list[dict[str, str]] = []
|
||||||
|
|
||||||
|
def visit_Call(self, node: ast.Call) -> None:
|
||||||
|
"""Visit function calls to find metric class instantiations."""
|
||||||
|
metric_type = self._get_metric_type(node)
|
||||||
|
if metric_type:
|
||||||
|
name = self._extract_kwarg(node, "name")
|
||||||
|
documentation = self._extract_kwarg(node, "documentation")
|
||||||
|
|
||||||
|
if name:
|
||||||
|
self.metrics.append(
|
||||||
|
{
|
||||||
|
"name": name,
|
||||||
|
"type": metric_type,
|
||||||
|
"documentation": documentation or "",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
self.generic_visit(node)
|
||||||
|
|
||||||
|
def _get_metric_type(self, node: ast.Call) -> str | None:
|
||||||
|
"""Determine if this call creates a metric and return its type."""
|
||||||
|
metric_type_map = {
|
||||||
|
"_gauge_cls": "gauge",
|
||||||
|
"_counter_cls": "counter",
|
||||||
|
"_histogram_cls": "histogram",
|
||||||
|
}
|
||||||
|
if isinstance(node.func, ast.Attribute):
|
||||||
|
return metric_type_map.get(node.func.attr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _extract_kwarg(self, node: ast.Call, key: str) -> str | None:
|
||||||
|
"""Extract a keyword argument value from a function call."""
|
||||||
|
for keyword in node.keywords:
|
||||||
|
if keyword.arg == key:
|
||||||
|
return self._get_string_value(keyword.value)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _get_string_value(self, node: ast.AST) -> str | None:
|
||||||
|
"""Extract string value from an AST node."""
|
||||||
|
if isinstance(node, ast.Constant):
|
||||||
|
return str(node.value) if node.value is not None else None
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]:
|
||||||
|
"""Parse a Python file and extract all metric definitions."""
|
||||||
|
try:
|
||||||
|
with open(filepath, encoding="utf-8") as f:
|
||||||
|
source = f.read()
|
||||||
|
|
||||||
|
tree = ast.parse(source, filename=str(filepath))
|
||||||
|
extractor = MetricExtractor()
|
||||||
|
extractor.visit(tree)
|
||||||
|
return extractor.metrics
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(f"Failed to parse {filepath}: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def generate_markdown_table(metrics: list[dict[str, str]]) -> str:
|
||||||
|
"""Generate a markdown table from extracted metrics."""
|
||||||
|
if not metrics:
|
||||||
|
return "No metrics found.\n"
|
||||||
|
|
||||||
|
# Sort by type, then by name
|
||||||
|
metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"]))
|
||||||
|
|
||||||
|
lines = []
|
||||||
|
lines.append("| Metric Name | Type | Description |")
|
||||||
|
lines.append("|-------------|------|-------------|")
|
||||||
|
|
||||||
|
for metric in metrics_sorted:
|
||||||
|
name = metric["name"]
|
||||||
|
metric_type = metric["type"].capitalize()
|
||||||
|
doc = metric["documentation"].replace("\n", " ").strip()
|
||||||
|
lines.append(f"| `{name}` | {metric_type} | {doc} |")
|
||||||
|
|
||||||
|
return "\n".join(lines) + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||||
|
"""Generate metrics documentation tables from source files."""
|
||||||
|
logger.info("Generating metrics documentation")
|
||||||
|
|
||||||
|
# Create generated directory if it doesn't exist
|
||||||
|
GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
total_metrics = 0
|
||||||
|
for source_config in METRIC_SOURCE_FILES:
|
||||||
|
source_path = source_config["path"]
|
||||||
|
output_file = source_config["output"]
|
||||||
|
|
||||||
|
filepath = ROOT_DIR / source_path
|
||||||
|
if not filepath.exists():
|
||||||
|
raise FileNotFoundError(f"Metrics source file not found: {filepath}")
|
||||||
|
|
||||||
|
logger.debug("Extracting metrics from: %s", source_path)
|
||||||
|
metrics = extract_metrics_from_file(filepath)
|
||||||
|
logger.debug("Found %d metrics in %s", len(metrics), source_path)
|
||||||
|
|
||||||
|
# Generate and write the markdown table for this source
|
||||||
|
table_content = generate_markdown_table(metrics)
|
||||||
|
output_path = GENERATED_METRICS_DIR / output_file
|
||||||
|
with open(output_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(table_content)
|
||||||
|
|
||||||
|
total_metrics += len(metrics)
|
||||||
|
logger.info(
|
||||||
|
"Generated metrics table: %s (%d metrics)",
|
||||||
|
output_path.relative_to(ROOT_DIR),
|
||||||
|
len(metrics),
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Total metrics generated: %d across %d files",
|
||||||
|
total_metrics,
|
||||||
|
len(METRIC_SOURCE_FILES),
|
||||||
|
)
|
||||||
@ -24,7 +24,7 @@ There are two distinct modes supported for online deployments - self-contained w
|
|||||||
|
|
||||||
vLLM supports "self-contained" data parallel deployments that expose a single API endpoint.
|
vLLM supports "self-contained" data parallel deployments that expose a single API endpoint.
|
||||||
|
|
||||||
It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs.
|
It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. When sizing DP deployments, remember that `--max-num-seqs` applies per DP rank.
|
||||||
|
|
||||||
Running a single data parallel deployment across multiple nodes requires a different `vllm serve` to be run on each node, specifying which DP ranks should run on that node. In this case, there will still be a single HTTP entrypoint - the API server(s) will run only on one node, but it doesn't necessarily need to be co-located with the DP ranks.
|
Running a single data parallel deployment across multiple nodes requires a different `vllm serve` to be run on each node, specifying which DP ranks should run on that node. In this case, there will still be a single HTTP entrypoint - the API server(s) will run only on one node, but it doesn't necessarily need to be co-located with the DP ranks.
|
||||||
|
|
||||||
@ -80,6 +80,18 @@ When deploying large DP sizes using this method, the API server process can beco
|
|||||||

|

|
||||||
</figure>
|
</figure>
|
||||||
|
|
||||||
|
## Hybrid Load Balancing
|
||||||
|
|
||||||
|
Hybrid load balancing sits between the internal and external approaches. Each node runs its own API server(s) that only queue requests to the data-parallel engines colocated on that node. An upstream load balancer (for example, an ingress controller or traffic router) spreads user requests across those per-node endpoints.
|
||||||
|
|
||||||
|
Enable this mode with `--data-parallel-hybrid-lb` while still launching every node with the global data-parallel size. The key differences from internal load balancing are:
|
||||||
|
|
||||||
|
- You must provide `--data-parallel-size-local` and `--data-parallel-start-rank` so each node knows which ranks it owns.
|
||||||
|
- Not compatible with `--headless` since every node exposes an API endpoint.
|
||||||
|
- Scale `--api-server-count` per node based on the number of local ranks
|
||||||
|
|
||||||
|
In this configuration, each node keeps scheduling decisions local, which reduces cross-node traffic and avoids single node bottlenecks at larger DP sizes.
|
||||||
|
|
||||||
## External Load Balancing
|
## External Load Balancing
|
||||||
|
|
||||||
For larger scale deployments especially, it can make sense to handle the orchestration and load balancing of data parallel ranks externally.
|
For larger scale deployments especially, it can make sense to handle the orchestration and load balancing of data parallel ranks externally.
|
||||||
|
|||||||
@ -40,10 +40,12 @@ EP_SIZE = TP_SIZE × DP_SIZE
|
|||||||
|
|
||||||
Where:
|
Where:
|
||||||
|
|
||||||
- `TP_SIZE`: Tensor parallel size (always 1 for now)
|
- `TP_SIZE`: Tensor parallel size
|
||||||
- `DP_SIZE`: Data parallel size
|
- `DP_SIZE`: Data parallel size
|
||||||
- `EP_SIZE`: Expert parallel size (computed automatically)
|
- `EP_SIZE`: Expert parallel size (computed automatically)
|
||||||
|
|
||||||
|
When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`.
|
||||||
|
|
||||||
### Example Command
|
### Example Command
|
||||||
|
|
||||||
The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section.
|
The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section.
|
||||||
@ -81,7 +83,7 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
|||||||
--data-parallel-size-local 8 \ # Local DP size on this node (8 GPUs per node)
|
--data-parallel-size-local 8 \ # Local DP size on this node (8 GPUs per node)
|
||||||
--data-parallel-address 192.168.1.100 \ # Replace with actual IP of Node 1
|
--data-parallel-address 192.168.1.100 \ # Replace with actual IP of Node 1
|
||||||
--data-parallel-rpc-port 13345 \ # RPC communication port, can be any port as long as reachable by all nodes
|
--data-parallel-rpc-port 13345 \ # RPC communication port, can be any port as long as reachable by all nodes
|
||||||
--api-server-count=8 # Number of API servers for load handling (scaling this out to total ranks are recommended)
|
--api-server-count=8 # Number of API servers for load handling (scaling this out to # local ranks is recommended)
|
||||||
|
|
||||||
# Node 2 (Secondary - headless mode, no API server)
|
# Node 2 (Secondary - headless mode, no API server)
|
||||||
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
||||||
@ -119,9 +121,6 @@ While MoE models are typically trained so that each expert receives a similar nu
|
|||||||
|
|
||||||
Enable EPLB with the `--enable-eplb` flag.
|
Enable EPLB with the `--enable-eplb` flag.
|
||||||
|
|
||||||
!!! note "Model Support"
|
|
||||||
Currently only DeepSeek V3 architecture is supported.
|
|
||||||
|
|
||||||
When enabled, vLLM collects load statistics with every forward pass and periodically rebalances expert distribution.
|
When enabled, vLLM collects load statistics with every forward pass and periodically rebalances expert distribution.
|
||||||
|
|
||||||
### EPLB Parameters
|
### EPLB Parameters
|
||||||
@ -134,6 +133,8 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. T
|
|||||||
| `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 |
|
| `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 |
|
||||||
| `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
|
| `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
|
||||||
| `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
|
| `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
|
||||||
|
| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` |
|
||||||
|
| `policy` | The policy type for expert parallel load balancing | `"default"` |
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
@ -183,6 +184,26 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
|
|||||||
|
|
||||||
For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--eplb-config '{"num_redundant_experts":32}'` to 32 in large scale use cases so the most popular experts are always available.
|
For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--eplb-config '{"num_redundant_experts":32}'` to 32 in large scale use cases so the most popular experts are always available.
|
||||||
|
|
||||||
|
## Advanced Configuration
|
||||||
|
|
||||||
|
### Performance Optimization
|
||||||
|
|
||||||
|
- **DeepEP kernels**: The `high_throughput` and `low_latency` kernels are optimized for disaggregated serving and may show poor performance for mixed workloads
|
||||||
|
- **Dual Batch Overlap**: Use `--enable-dbo` to overlap all-to-all communication with compute. See [Dual Batch Overlap](../design/dbo.md) for more details.
|
||||||
|
- **Async scheduling (experimental)**: Try `--async-scheduling` to overlap scheduling with model execution.
|
||||||
|
|
||||||
|
### Troubleshooting
|
||||||
|
|
||||||
|
- **`non-zero status: 7 cannot register cq buf`**: When using Infiniband/RoCE, make sure host VM and pods show `ulimit -l` "unlimited".
|
||||||
|
- **`init failed for transport: IBGDA`**: The InfiniBand GDA kernel modules are missing. Run `tools/ep_kernels/configure_system_drivers.sh` on each GPU node and reboot. Also fixes error `NVSHMEM API called before NVSHMEM initialization has completed`.
|
||||||
|
- **NVSHMEM peer disconnect**: Usually a networking misconfiguration. If deploying via Kubernetes, verify that every pod runs with `hostNetwork: true`, `securityContext.privileged: true` to access Infiniband.
|
||||||
|
|
||||||
|
### Benchmarking
|
||||||
|
|
||||||
|
- Use simulator flags `VLLM_MOE_ROUTING_SIMULATION_STRATEGY=uniform_random` and `VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1` so token routing is balanced across EP ranks.
|
||||||
|
|
||||||
|
- Increasing `VLLM_MOE_DP_CHUNK_SIZE` may increase throughput by increasing the maximum batch size for inter-rank token transfers. This may cause DeepEP to throw `assert self.nvshmem_qp_depth >= (num_max_dispatch_tokens_per_rank + 1) * 2`, which can be fixed by increasing environment variable `NVSHMEM_QP_DEPTH`.
|
||||||
|
|
||||||
## Disaggregated Serving (Prefill/Decode Split)
|
## Disaggregated Serving (Prefill/Decode Split)
|
||||||
|
|
||||||
For production deployments requiring strict SLA guarantees for time-to-first-token and inter-token latency, disaggregated serving allows independent scaling of prefill and decode operations.
|
For production deployments requiring strict SLA guarantees for time-to-first-token and inter-token latency, disaggregated serving allows independent scaling of prefill and decode operations.
|
||||||
@ -273,3 +294,9 @@ except Exception as e:
|
|||||||
print(f"❌ Error during disaggregated serving: {e}")
|
print(f"❌ Error during disaggregated serving: {e}")
|
||||||
print("Check that both prefill and decode instances are running and accessible")
|
print("Check that both prefill and decode instances are running and accessible")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Benchmarking
|
||||||
|
|
||||||
|
- To simulate the decode deployment of disaggregated serving, pass `--kv-transfer-config '{"kv_connector":"DecodeBenchConnector","kv_role":"kv_both"}'` to the `vllm serve` invocation. The connector populates KV cache with random values so decode can be profiled in isolation.
|
||||||
|
|
||||||
|
- **CUDAGraph capture**: Use `--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'` to enable CUDA graph capture for decode only and save KV cache.
|
||||||
|
|||||||
@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server:
|
|||||||
|
|
||||||
The following metrics are exposed:
|
The following metrics are exposed:
|
||||||
|
|
||||||
??? code
|
## General Metrics
|
||||||
|
|
||||||
```python
|
--8<-- "docs/generated/metrics/general.md"
|
||||||
--8<-- "vllm/engine/metrics.py:metrics-definitions"
|
|
||||||
```
|
## Speculative Decoding Metrics
|
||||||
|
|
||||||
|
--8<-- "docs/generated/metrics/spec_decode.md"
|
||||||
|
|
||||||
|
## NIXL KV Connector Metrics
|
||||||
|
|
||||||
|
--8<-- "docs/generated/metrics/nixl_connector.md"
|
||||||
|
|
||||||
|
## Deprecation Policy
|
||||||
|
|
||||||
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
|
||||||
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
|
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
|
||||||
|
|||||||
@ -51,6 +51,7 @@ hooks:
|
|||||||
- docs/mkdocs/hooks/remove_announcement.py
|
- docs/mkdocs/hooks/remove_announcement.py
|
||||||
- docs/mkdocs/hooks/generate_examples.py
|
- docs/mkdocs/hooks/generate_examples.py
|
||||||
- docs/mkdocs/hooks/generate_argparse.py
|
- docs/mkdocs/hooks/generate_argparse.py
|
||||||
|
- docs/mkdocs/hooks/generate_metrics.py
|
||||||
- docs/mkdocs/hooks/url_schemes.py
|
- docs/mkdocs/hooks/url_schemes.py
|
||||||
|
|
||||||
plugins:
|
plugins:
|
||||||
|
|||||||
@ -1,2 +1,2 @@
|
|||||||
lmcache
|
lmcache >= 0.3.10.post1
|
||||||
nixl >= 0.7.1 # Required for disaggregated prefill
|
nixl >= 0.7.1 # Required for disaggregated prefill
|
||||||
|
|||||||
228
tests/entrypoints/openai/test_chat_error.py
Normal file
228
tests/entrypoints/openai/test_chat_error.py
Normal file
@ -0,0 +1,228 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from http import HTTPStatus
|
||||||
|
from typing import Any
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.config.multimodal import MultiModalConfig
|
||||||
|
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
|
||||||
|
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||||
|
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||||
|
from vllm.outputs import CompletionOutput, RequestOutput
|
||||||
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
from vllm.v1.engine.async_llm import AsyncLLM
|
||||||
|
|
||||||
|
MODEL_NAME = "openai-community/gpt2"
|
||||||
|
MODEL_NAME_SHORT = "gpt2"
|
||||||
|
BASE_MODEL_PATHS = [
|
||||||
|
BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
|
||||||
|
BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MockHFConfig:
|
||||||
|
model_type: str = "any"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MockModelConfig:
|
||||||
|
task = "generate"
|
||||||
|
runner_type = "generate"
|
||||||
|
tokenizer = MODEL_NAME
|
||||||
|
trust_remote_code = False
|
||||||
|
tokenizer_mode = "auto"
|
||||||
|
max_model_len = 100
|
||||||
|
tokenizer_revision = None
|
||||||
|
multimodal_config = MultiModalConfig()
|
||||||
|
hf_config = MockHFConfig()
|
||||||
|
logits_processor_pattern = None
|
||||||
|
logits_processors: list[str] | None = None
|
||||||
|
diff_sampling_param: dict | None = None
|
||||||
|
allowed_local_media_path: str = ""
|
||||||
|
allowed_media_domains: list[str] | None = None
|
||||||
|
encoder_config = None
|
||||||
|
generation_config: str = "auto"
|
||||||
|
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||||
|
skip_tokenizer_init = False
|
||||||
|
|
||||||
|
def get_diff_sampling_param(self):
|
||||||
|
return self.diff_sampling_param or {}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
||||||
|
models = OpenAIServingModels(
|
||||||
|
engine_client=engine,
|
||||||
|
base_model_paths=BASE_MODEL_PATHS,
|
||||||
|
)
|
||||||
|
serving_chat = OpenAIServingChat(
|
||||||
|
engine,
|
||||||
|
models,
|
||||||
|
response_role="assistant",
|
||||||
|
request_logger=None,
|
||||||
|
chat_template=None,
|
||||||
|
chat_template_content_format="auto",
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _fake_process_inputs(
|
||||||
|
request_id,
|
||||||
|
engine_prompt,
|
||||||
|
sampling_params,
|
||||||
|
*,
|
||||||
|
lora_request,
|
||||||
|
trace_headers,
|
||||||
|
priority,
|
||||||
|
):
|
||||||
|
return dict(engine_prompt), {}
|
||||||
|
|
||||||
|
async def _fake_preprocess_chat(*args, **kwargs):
|
||||||
|
# return conversation, request_prompts, engine_prompts
|
||||||
|
return (
|
||||||
|
[{"role": "user", "content": "Test"}],
|
||||||
|
[[1, 2, 3]],
|
||||||
|
[{"prompt_token_ids": [1, 2, 3]}],
|
||||||
|
)
|
||||||
|
|
||||||
|
serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
|
||||||
|
serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
|
||||||
|
return serving_chat
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_chat_error_non_stream():
|
||||||
|
"""test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
|
||||||
|
mock_engine = MagicMock(spec=AsyncLLM)
|
||||||
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
|
mock_engine.errored = False
|
||||||
|
mock_engine.model_config = MockModelConfig()
|
||||||
|
mock_engine.input_processor = MagicMock()
|
||||||
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
|
serving_chat = _build_serving_chat(mock_engine)
|
||||||
|
|
||||||
|
completion_output = CompletionOutput(
|
||||||
|
index=0,
|
||||||
|
text="",
|
||||||
|
token_ids=[],
|
||||||
|
cumulative_logprob=None,
|
||||||
|
logprobs=None,
|
||||||
|
finish_reason="error",
|
||||||
|
)
|
||||||
|
|
||||||
|
request_output = RequestOutput(
|
||||||
|
request_id="test-id",
|
||||||
|
prompt="Test prompt",
|
||||||
|
prompt_token_ids=[1, 2, 3],
|
||||||
|
prompt_logprobs=None,
|
||||||
|
outputs=[completion_output],
|
||||||
|
finished=True,
|
||||||
|
metrics=None,
|
||||||
|
lora_request=None,
|
||||||
|
encoder_prompt=None,
|
||||||
|
encoder_prompt_token_ids=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def mock_generate(*args, **kwargs):
|
||||||
|
yield request_output
|
||||||
|
|
||||||
|
mock_engine.generate = MagicMock(side_effect=mock_generate)
|
||||||
|
|
||||||
|
request = ChatCompletionRequest(
|
||||||
|
model=MODEL_NAME,
|
||||||
|
messages=[{"role": "user", "content": "Test prompt"}],
|
||||||
|
max_tokens=10,
|
||||||
|
stream=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
response = await serving_chat.create_chat_completion(request)
|
||||||
|
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.error.type == "InternalServerError"
|
||||||
|
assert response.error.message == "Internal server error"
|
||||||
|
assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_chat_error_stream():
|
||||||
|
"""test finish_reason='error' returns 500 InternalServerError (streaming)"""
|
||||||
|
mock_engine = MagicMock(spec=AsyncLLM)
|
||||||
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
|
mock_engine.errored = False
|
||||||
|
mock_engine.model_config = MockModelConfig()
|
||||||
|
mock_engine.input_processor = MagicMock()
|
||||||
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
|
serving_chat = _build_serving_chat(mock_engine)
|
||||||
|
|
||||||
|
completion_output_1 = CompletionOutput(
|
||||||
|
index=0,
|
||||||
|
text="Hello",
|
||||||
|
token_ids=[100],
|
||||||
|
cumulative_logprob=None,
|
||||||
|
logprobs=None,
|
||||||
|
finish_reason=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
request_output_1 = RequestOutput(
|
||||||
|
request_id="test-id",
|
||||||
|
prompt="Test prompt",
|
||||||
|
prompt_token_ids=[1, 2, 3],
|
||||||
|
prompt_logprobs=None,
|
||||||
|
outputs=[completion_output_1],
|
||||||
|
finished=False,
|
||||||
|
metrics=None,
|
||||||
|
lora_request=None,
|
||||||
|
encoder_prompt=None,
|
||||||
|
encoder_prompt_token_ids=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
completion_output_2 = CompletionOutput(
|
||||||
|
index=0,
|
||||||
|
text="Hello",
|
||||||
|
token_ids=[100],
|
||||||
|
cumulative_logprob=None,
|
||||||
|
logprobs=None,
|
||||||
|
finish_reason="error",
|
||||||
|
)
|
||||||
|
|
||||||
|
request_output_2 = RequestOutput(
|
||||||
|
request_id="test-id",
|
||||||
|
prompt="Test prompt",
|
||||||
|
prompt_token_ids=[1, 2, 3],
|
||||||
|
prompt_logprobs=None,
|
||||||
|
outputs=[completion_output_2],
|
||||||
|
finished=True,
|
||||||
|
metrics=None,
|
||||||
|
lora_request=None,
|
||||||
|
encoder_prompt=None,
|
||||||
|
encoder_prompt_token_ids=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def mock_generate(*args, **kwargs):
|
||||||
|
yield request_output_1
|
||||||
|
yield request_output_2
|
||||||
|
|
||||||
|
mock_engine.generate = MagicMock(side_effect=mock_generate)
|
||||||
|
|
||||||
|
request = ChatCompletionRequest(
|
||||||
|
model=MODEL_NAME,
|
||||||
|
messages=[{"role": "user", "content": "Test prompt"}],
|
||||||
|
max_tokens=10,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
response = await serving_chat.create_chat_completion(request)
|
||||||
|
|
||||||
|
chunks = []
|
||||||
|
async for chunk in response:
|
||||||
|
chunks.append(chunk)
|
||||||
|
|
||||||
|
assert len(chunks) >= 2
|
||||||
|
assert any("Internal server error" in chunk for chunk in chunks), (
|
||||||
|
f"Expected error message in chunks: {chunks}"
|
||||||
|
)
|
||||||
|
assert chunks[-1] == "data: [DONE]\n\n"
|
||||||
216
tests/entrypoints/openai/test_completion_error.py
Normal file
216
tests/entrypoints/openai/test_completion_error.py
Normal file
@ -0,0 +1,216 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from http import HTTPStatus
|
||||||
|
from typing import Any
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.config.multimodal import MultiModalConfig
|
||||||
|
from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
|
||||||
|
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||||
|
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
|
||||||
|
from vllm.outputs import CompletionOutput, RequestOutput
|
||||||
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
|
from vllm.v1.engine.async_llm import AsyncLLM
|
||||||
|
|
||||||
|
MODEL_NAME = "openai-community/gpt2"
|
||||||
|
MODEL_NAME_SHORT = "gpt2"
|
||||||
|
BASE_MODEL_PATHS = [
|
||||||
|
BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
|
||||||
|
BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MockHFConfig:
|
||||||
|
model_type: str = "any"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MockModelConfig:
|
||||||
|
task = "generate"
|
||||||
|
runner_type = "generate"
|
||||||
|
tokenizer = MODEL_NAME
|
||||||
|
trust_remote_code = False
|
||||||
|
tokenizer_mode = "auto"
|
||||||
|
max_model_len = 100
|
||||||
|
tokenizer_revision = None
|
||||||
|
multimodal_config = MultiModalConfig()
|
||||||
|
hf_config = MockHFConfig()
|
||||||
|
logits_processor_pattern = None
|
||||||
|
logits_processors: list[str] | None = None
|
||||||
|
diff_sampling_param: dict | None = None
|
||||||
|
allowed_local_media_path: str = ""
|
||||||
|
allowed_media_domains: list[str] | None = None
|
||||||
|
encoder_config = None
|
||||||
|
generation_config: str = "auto"
|
||||||
|
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||||
|
skip_tokenizer_init = False
|
||||||
|
|
||||||
|
def get_diff_sampling_param(self):
|
||||||
|
return self.diff_sampling_param or {}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
|
||||||
|
models = OpenAIServingModels(
|
||||||
|
engine_client=engine,
|
||||||
|
base_model_paths=BASE_MODEL_PATHS,
|
||||||
|
)
|
||||||
|
serving_completion = OpenAIServingCompletion(
|
||||||
|
engine,
|
||||||
|
models,
|
||||||
|
request_logger=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _fake_process_inputs(
|
||||||
|
request_id,
|
||||||
|
engine_prompt,
|
||||||
|
sampling_params,
|
||||||
|
*,
|
||||||
|
lora_request,
|
||||||
|
trace_headers,
|
||||||
|
priority,
|
||||||
|
):
|
||||||
|
return dict(engine_prompt), {}
|
||||||
|
|
||||||
|
serving_completion._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
|
||||||
|
return serving_completion
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_completion_error_non_stream():
|
||||||
|
"""test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
|
||||||
|
mock_engine = MagicMock(spec=AsyncLLM)
|
||||||
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
|
mock_engine.errored = False
|
||||||
|
mock_engine.model_config = MockModelConfig()
|
||||||
|
mock_engine.input_processor = MagicMock()
|
||||||
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
|
serving_completion = _build_serving_completion(mock_engine)
|
||||||
|
|
||||||
|
completion_output = CompletionOutput(
|
||||||
|
index=0,
|
||||||
|
text="",
|
||||||
|
token_ids=[],
|
||||||
|
cumulative_logprob=None,
|
||||||
|
logprobs=None,
|
||||||
|
finish_reason="error",
|
||||||
|
)
|
||||||
|
|
||||||
|
request_output = RequestOutput(
|
||||||
|
request_id="test-id",
|
||||||
|
prompt="Test prompt",
|
||||||
|
prompt_token_ids=[1, 2, 3],
|
||||||
|
prompt_logprobs=None,
|
||||||
|
outputs=[completion_output],
|
||||||
|
finished=True,
|
||||||
|
metrics=None,
|
||||||
|
lora_request=None,
|
||||||
|
encoder_prompt=None,
|
||||||
|
encoder_prompt_token_ids=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def mock_generate(*args, **kwargs):
|
||||||
|
yield request_output
|
||||||
|
|
||||||
|
mock_engine.generate = MagicMock(side_effect=mock_generate)
|
||||||
|
|
||||||
|
request = CompletionRequest(
|
||||||
|
model=MODEL_NAME,
|
||||||
|
prompt="Test prompt",
|
||||||
|
max_tokens=10,
|
||||||
|
stream=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
response = await serving_completion.create_completion(request)
|
||||||
|
|
||||||
|
assert isinstance(response, ErrorResponse)
|
||||||
|
assert response.error.type == "InternalServerError"
|
||||||
|
assert response.error.message == "Internal server error"
|
||||||
|
assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_completion_error_stream():
|
||||||
|
"""test finish_reason='error' returns 500 InternalServerError (streaming)"""
|
||||||
|
mock_engine = MagicMock(spec=AsyncLLM)
|
||||||
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
|
mock_engine.errored = False
|
||||||
|
mock_engine.model_config = MockModelConfig()
|
||||||
|
mock_engine.input_processor = MagicMock()
|
||||||
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
|
serving_completion = _build_serving_completion(mock_engine)
|
||||||
|
|
||||||
|
completion_output_1 = CompletionOutput(
|
||||||
|
index=0,
|
||||||
|
text="Hello",
|
||||||
|
token_ids=[100],
|
||||||
|
cumulative_logprob=None,
|
||||||
|
logprobs=None,
|
||||||
|
finish_reason=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
request_output_1 = RequestOutput(
|
||||||
|
request_id="test-id",
|
||||||
|
prompt="Test prompt",
|
||||||
|
prompt_token_ids=[1, 2, 3],
|
||||||
|
prompt_logprobs=None,
|
||||||
|
outputs=[completion_output_1],
|
||||||
|
finished=False,
|
||||||
|
metrics=None,
|
||||||
|
lora_request=None,
|
||||||
|
encoder_prompt=None,
|
||||||
|
encoder_prompt_token_ids=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
completion_output_2 = CompletionOutput(
|
||||||
|
index=0,
|
||||||
|
text="Hello",
|
||||||
|
token_ids=[100],
|
||||||
|
cumulative_logprob=None,
|
||||||
|
logprobs=None,
|
||||||
|
finish_reason="error",
|
||||||
|
)
|
||||||
|
|
||||||
|
request_output_2 = RequestOutput(
|
||||||
|
request_id="test-id",
|
||||||
|
prompt="Test prompt",
|
||||||
|
prompt_token_ids=[1, 2, 3],
|
||||||
|
prompt_logprobs=None,
|
||||||
|
outputs=[completion_output_2],
|
||||||
|
finished=True,
|
||||||
|
metrics=None,
|
||||||
|
lora_request=None,
|
||||||
|
encoder_prompt=None,
|
||||||
|
encoder_prompt_token_ids=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def mock_generate(*args, **kwargs):
|
||||||
|
yield request_output_1
|
||||||
|
yield request_output_2
|
||||||
|
|
||||||
|
mock_engine.generate = MagicMock(side_effect=mock_generate)
|
||||||
|
|
||||||
|
request = CompletionRequest(
|
||||||
|
model=MODEL_NAME,
|
||||||
|
prompt="Test prompt",
|
||||||
|
max_tokens=10,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
response = await serving_completion.create_completion(request)
|
||||||
|
|
||||||
|
chunks = []
|
||||||
|
async for chunk in response:
|
||||||
|
chunks.append(chunk)
|
||||||
|
|
||||||
|
assert len(chunks) >= 2
|
||||||
|
assert any("Internal server error" in chunk for chunk in chunks), (
|
||||||
|
f"Expected error message in chunks: {chunks}"
|
||||||
|
)
|
||||||
|
assert chunks[-1] == "data: [DONE]\n\n"
|
||||||
89
tests/entrypoints/openai/test_responses_error.py
Normal file
89
tests/entrypoints/openai/test_responses_error.py
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from http import HTTPStatus
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.entrypoints.openai.protocol import ErrorResponse
|
||||||
|
from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_raise_if_error_raises_generation_error():
|
||||||
|
"""test _raise_if_error raises GenerationError"""
|
||||||
|
# create a minimal OpenAIServing instance
|
||||||
|
mock_engine = MagicMock()
|
||||||
|
mock_engine.model_config = MagicMock()
|
||||||
|
mock_engine.model_config.max_model_len = 100
|
||||||
|
mock_models = MagicMock()
|
||||||
|
|
||||||
|
serving = OpenAIServing(
|
||||||
|
engine_client=mock_engine,
|
||||||
|
models=mock_models,
|
||||||
|
request_logger=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
# test that error finish_reason raises GenerationError
|
||||||
|
with pytest.raises(GenerationError) as exc_info:
|
||||||
|
serving._raise_if_error("error", "test-request-id")
|
||||||
|
|
||||||
|
assert str(exc_info.value) == "Internal server error"
|
||||||
|
assert exc_info.value.status_code == HTTPStatus.INTERNAL_SERVER_ERROR
|
||||||
|
|
||||||
|
# test that other finish_reasons don't raise
|
||||||
|
serving._raise_if_error("stop", "test-request-id") # should not raise
|
||||||
|
serving._raise_if_error("length", "test-request-id") # should not raise
|
||||||
|
serving._raise_if_error(None, "test-request-id") # should not raise
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_convert_generation_error_to_response():
|
||||||
|
"""test _convert_generation_error_to_response creates proper ErrorResponse"""
|
||||||
|
mock_engine = MagicMock()
|
||||||
|
mock_engine.model_config = MagicMock()
|
||||||
|
mock_engine.model_config.max_model_len = 100
|
||||||
|
mock_models = MagicMock()
|
||||||
|
|
||||||
|
serving = OpenAIServing(
|
||||||
|
engine_client=mock_engine,
|
||||||
|
models=mock_models,
|
||||||
|
request_logger=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
# create a GenerationError
|
||||||
|
gen_error = GenerationError("Internal server error")
|
||||||
|
|
||||||
|
# convert to ErrorResponse
|
||||||
|
error_response = serving._convert_generation_error_to_response(gen_error)
|
||||||
|
|
||||||
|
assert isinstance(error_response, ErrorResponse)
|
||||||
|
assert error_response.error.type == "InternalServerError"
|
||||||
|
assert error_response.error.message == "Internal server error"
|
||||||
|
assert error_response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_convert_generation_error_to_streaming_response():
|
||||||
|
"""test _convert_generation_error_to_streaming_response output"""
|
||||||
|
mock_engine = MagicMock()
|
||||||
|
mock_engine.model_config = MagicMock()
|
||||||
|
mock_engine.model_config.max_model_len = 100
|
||||||
|
mock_models = MagicMock()
|
||||||
|
|
||||||
|
serving = OpenAIServing(
|
||||||
|
engine_client=mock_engine,
|
||||||
|
models=mock_models,
|
||||||
|
request_logger=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
# create a GenerationError
|
||||||
|
gen_error = GenerationError("Internal server error")
|
||||||
|
|
||||||
|
# convert to streaming error response
|
||||||
|
error_json = serving._convert_generation_error_to_streaming_response(gen_error)
|
||||||
|
|
||||||
|
assert isinstance(error_json, str)
|
||||||
|
assert "Internal server error" in error_json
|
||||||
|
assert "InternalServerError" in error_json
|
||||||
@ -92,16 +92,19 @@ def run_test(
|
|||||||
*,
|
*,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
distributed_executor_backend: str | None = None,
|
distributed_executor_backend: str | None = None,
|
||||||
|
dtype: str = "half",
|
||||||
) -> None:
|
) -> None:
|
||||||
prompt_list = PROMPTS * 10
|
prompt_list = PROMPTS * 10
|
||||||
expected_list = EXPECTED[model] * 10
|
expected_list = EXPECTED[model] * 10
|
||||||
|
|
||||||
with vllm_runner(
|
with vllm_runner(
|
||||||
model,
|
model,
|
||||||
dtype="half",
|
dtype=dtype,
|
||||||
max_model_len=448,
|
max_model_len=448,
|
||||||
tensor_parallel_size=tensor_parallel_size,
|
tensor_parallel_size=tensor_parallel_size,
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
|
# TODO (NickLucche) figure out output differences with non-eager and re-enable
|
||||||
|
enforce_eager=True,
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
llm = vllm_model.llm
|
llm = vllm_model.llm
|
||||||
|
|
||||||
@ -120,12 +123,28 @@ def run_test(
|
|||||||
|
|
||||||
@pytest.mark.core_model
|
@pytest.mark.core_model
|
||||||
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
|
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
|
||||||
|
@pytest.mark.parametrize("dtype", ["half"])
|
||||||
@create_new_process_for_each_test()
|
@create_new_process_for_each_test()
|
||||||
def test_models(vllm_runner, model) -> None:
|
def test_models(vllm_runner, model, dtype) -> None:
|
||||||
run_test(
|
run_test(
|
||||||
vllm_runner,
|
vllm_runner,
|
||||||
model,
|
model,
|
||||||
tensor_parallel_size=1,
|
tensor_parallel_size=1,
|
||||||
|
dtype=dtype,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.cpu_model
|
||||||
|
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
|
||||||
|
@pytest.mark.parametrize("dtype", ["half"])
|
||||||
|
def test_models_cpu(vllm_runner, model, dtype) -> None:
|
||||||
|
# @create_new_process_for_each_test() does not work for some runners
|
||||||
|
# TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
|
||||||
|
run_test(
|
||||||
|
vllm_runner,
|
||||||
|
model,
|
||||||
|
tensor_parallel_size=1,
|
||||||
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import pytest
|
|||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.envs import (
|
from vllm.envs import (
|
||||||
|
disable_envs_cache,
|
||||||
enable_envs_cache,
|
enable_envs_cache,
|
||||||
env_list_with_choices,
|
env_list_with_choices,
|
||||||
env_set_with_choices,
|
env_set_with_choices,
|
||||||
@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
|
|||||||
envs.__getattr__ = envs.__getattr__.__wrapped__
|
envs.__getattr__ = envs.__getattr__.__wrapped__
|
||||||
|
|
||||||
|
|
||||||
|
def test_getattr_with_reset(monkeypatch: pytest.MonkeyPatch) -> None:
|
||||||
|
monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1")
|
||||||
|
# __getattr__ is not decorated with functools.cache
|
||||||
|
assert not hasattr(envs.__getattr__, "cache_info")
|
||||||
|
|
||||||
|
# Enable envs cache and ignore ongoing environment changes
|
||||||
|
enable_envs_cache()
|
||||||
|
assert envs.VLLM_HOST_IP == "1.1.1.1"
|
||||||
|
# With cache enabled, the environment variable value is cached and unchanged
|
||||||
|
monkeypatch.setenv("VLLM_HOST_IP", "2.2.2.2")
|
||||||
|
assert envs.VLLM_HOST_IP == "1.1.1.1"
|
||||||
|
|
||||||
|
disable_envs_cache()
|
||||||
|
assert envs.VLLM_HOST_IP == "2.2.2.2"
|
||||||
|
# After cache disabled, the environment variable value would be synced
|
||||||
|
# with os.environ
|
||||||
|
monkeypatch.setenv("VLLM_HOST_IP", "3.3.3.3")
|
||||||
|
assert envs.VLLM_HOST_IP == "3.3.3.3"
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_envs_cache_enabled() -> None:
|
||||||
|
assert not envs._is_envs_cache_enabled()
|
||||||
|
enable_envs_cache()
|
||||||
|
assert envs._is_envs_cache_enabled()
|
||||||
|
|
||||||
|
# Only wrap one-layer of cache, so we only need to
|
||||||
|
# call disable once to reset.
|
||||||
|
enable_envs_cache()
|
||||||
|
enable_envs_cache()
|
||||||
|
enable_envs_cache()
|
||||||
|
disable_envs_cache()
|
||||||
|
assert not envs._is_envs_cache_enabled()
|
||||||
|
|
||||||
|
disable_envs_cache()
|
||||||
|
assert not envs._is_envs_cache_enabled()
|
||||||
|
|
||||||
|
|
||||||
class TestEnvWithChoices:
|
class TestEnvWithChoices:
|
||||||
"""Test cases for env_with_choices function."""
|
"""Test cases for env_with_choices function."""
|
||||||
|
|
||||||
|
|||||||
54
tests/v1/engine/test_init_error_messaging.py
Normal file
54
tests/v1/engine/test_init_error_messaging.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.v1.core.kv_cache_utils import check_enough_kv_cache_memory
|
||||||
|
from vllm.v1.kv_cache_interface import FullAttentionSpec
|
||||||
|
|
||||||
|
|
||||||
|
def test_kv_cache_oom_no_memory():
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
config = MagicMock()
|
||||||
|
config.model_config.max_model_len = 2048
|
||||||
|
|
||||||
|
spec = {
|
||||||
|
"layer_0": FullAttentionSpec(
|
||||||
|
block_size=16,
|
||||||
|
num_kv_heads=8,
|
||||||
|
head_size=128,
|
||||||
|
dtype="float16",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
check_enough_kv_cache_memory(config, spec, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_kv_cache_oom_insufficient_memory(monkeypatch):
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
config = MagicMock()
|
||||||
|
config.model_config.max_model_len = 2048
|
||||||
|
config.cache_config.block_size = 16
|
||||||
|
config.parallel_config.tensor_parallel_size = 1
|
||||||
|
config.parallel_config.pipeline_parallel_size = 1
|
||||||
|
config.parallel_config.decode_context_parallel_size = 1
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"vllm.v1.core.kv_cache_utils.max_memory_usage_bytes",
|
||||||
|
lambda c, s: 100 * 1024**3, # 100 GiB
|
||||||
|
)
|
||||||
|
|
||||||
|
spec = {
|
||||||
|
"layer_0": FullAttentionSpec(
|
||||||
|
block_size=16,
|
||||||
|
num_kv_heads=8,
|
||||||
|
head_size=128,
|
||||||
|
dtype="float16",
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
check_enough_kv_cache_memory(config, spec, 1024**3) # 1 GiB
|
||||||
163
tests/v1/kv_connector/unit/test_cache_pollution_prevention.py
Normal file
163
tests/v1/kv_connector/unit/test_cache_pollution_prevention.py
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""
|
||||||
|
test that invalid blocks are evicted from prefix cache to prevent pollution.
|
||||||
|
|
||||||
|
verifies that when sync-loading fails, invalid blocks are removed from the
|
||||||
|
prefix cache hash table so future requests cannot match and reuse corrupted data.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections.abc import Callable
|
||||||
|
from unittest.mock import Mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.v1.core.sched.scheduler import Scheduler
|
||||||
|
from vllm.v1.request import Request, RequestStatus
|
||||||
|
|
||||||
|
from .utils import (
|
||||||
|
create_model_runner_output,
|
||||||
|
create_request,
|
||||||
|
create_scheduler,
|
||||||
|
create_vllm_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.cpu_test
|
||||||
|
|
||||||
|
|
||||||
|
def _make_get_num_new_matched_tokens(
|
||||||
|
req_num_new_matched_tokens: dict[str, int],
|
||||||
|
async_load: bool,
|
||||||
|
) -> Callable[[Request, int], tuple[int, bool]]:
|
||||||
|
def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]:
|
||||||
|
value = req_num_new_matched_tokens.get(request.request_id, 0)
|
||||||
|
return value, async_load
|
||||||
|
|
||||||
|
return get_num_new_matched_tokens
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def fail_scheduler():
|
||||||
|
"""scheduler with kv_load_failure_policy='fail'"""
|
||||||
|
vllm_config = create_vllm_config()
|
||||||
|
vllm_config.kv_transfer_config.kv_load_failure_policy = "fail"
|
||||||
|
return create_scheduler(vllm_config)
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_blocks_evicted_prevents_cache_pollution(
|
||||||
|
fail_scheduler: Scheduler,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
verify invalid blocks are evicted to prevent future cache hits.
|
||||||
|
|
||||||
|
scenario:
|
||||||
|
1. request 1 loads externally-computed blocks (sync mode)
|
||||||
|
2. some blocks fail to load and are marked invalid
|
||||||
|
3. with fail policy, invalid blocks should be evicted from prefix cache
|
||||||
|
4. request is marked as FINISHED_ERROR
|
||||||
|
"""
|
||||||
|
num_prompt_blocks = 100
|
||||||
|
num_external_computed_blocks = 99
|
||||||
|
invalid_block_idx = 50
|
||||||
|
|
||||||
|
num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size
|
||||||
|
num_external_computed_tokens = (
|
||||||
|
num_external_computed_blocks * fail_scheduler.block_size
|
||||||
|
)
|
||||||
|
|
||||||
|
# request 1: will have invalid blocks
|
||||||
|
request1 = create_request(num_tokens=num_prompt_tokens, request_id=1)
|
||||||
|
fail_scheduler.add_request(request=request1)
|
||||||
|
|
||||||
|
req_num_new_matched_tokens = {
|
||||||
|
request1.request_id: num_external_computed_tokens,
|
||||||
|
}
|
||||||
|
|
||||||
|
# mock connector indicating sync load
|
||||||
|
fail_scheduler.connector = Mock()
|
||||||
|
fail_scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||||
|
_make_get_num_new_matched_tokens(req_num_new_matched_tokens, False)
|
||||||
|
)
|
||||||
|
fail_scheduler.connector.request_finished.return_value = (False, None)
|
||||||
|
fail_scheduler.connector.take_events.return_value = ()
|
||||||
|
|
||||||
|
scheduler_output = fail_scheduler.schedule()
|
||||||
|
|
||||||
|
# request should be running with sync KV load
|
||||||
|
assert len(fail_scheduler.running) == 1
|
||||||
|
assert request1.status == RequestStatus.RUNNING
|
||||||
|
|
||||||
|
# get allocated block IDs
|
||||||
|
req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
|
||||||
|
invalid_block_id = req_block_ids[invalid_block_idx]
|
||||||
|
invalid_block_ids = {invalid_block_id}
|
||||||
|
|
||||||
|
# get the block object to verify eviction later
|
||||||
|
block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id]
|
||||||
|
|
||||||
|
# cache the blocks to simulate they've been computed and cached
|
||||||
|
# (in real scenario blocks would be cached after compute)
|
||||||
|
fail_scheduler.kv_cache_manager.cache_blocks(request1, num_external_computed_tokens)
|
||||||
|
|
||||||
|
# verify block has a hash (is cached) before reporting invalid blocks
|
||||||
|
assert block.block_hash is not None, (
|
||||||
|
f"block {invalid_block_id} should be cached (have a hash) before "
|
||||||
|
f"eviction test, but hash is None"
|
||||||
|
)
|
||||||
|
|
||||||
|
# report invalid blocks
|
||||||
|
model_runner_output = create_model_runner_output(
|
||||||
|
[request1],
|
||||||
|
invalid_block_ids=invalid_block_ids,
|
||||||
|
use_eos=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
fail_scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||||
|
|
||||||
|
# verify request finished with error (fail policy)
|
||||||
|
assert request1.status == RequestStatus.FINISHED_ERROR
|
||||||
|
|
||||||
|
# critical assertion: invalid block and all subsequent blocks should be evicted
|
||||||
|
# all blocks from invalid_block_idx onwards become invalid since they were
|
||||||
|
# computed based on the failed block
|
||||||
|
for idx in range(invalid_block_idx, len(req_block_ids)):
|
||||||
|
block_id = req_block_ids[idx]
|
||||||
|
block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id]
|
||||||
|
assert block_obj.block_hash is None, (
|
||||||
|
f"block {block_id} at index {idx} should have been evicted "
|
||||||
|
f"(hash reset to None), but hash is {block_obj.block_hash}. "
|
||||||
|
f"All blocks from index {invalid_block_idx} onwards should be evicted "
|
||||||
|
f"since they depend on the invalid block at index {invalid_block_idx}."
|
||||||
|
)
|
||||||
|
|
||||||
|
# verify cache contains exactly the valid blocks (before first affected block)
|
||||||
|
# and none of the invalid blocks (from first affected block onwards)
|
||||||
|
|
||||||
|
# valid blocks: all blocks before invalid_block_idx should be cached
|
||||||
|
for idx in range(invalid_block_idx):
|
||||||
|
block_id = req_block_ids[idx]
|
||||||
|
block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id]
|
||||||
|
assert block_obj.block_hash is not None, (
|
||||||
|
f"valid block {block_id} at index {idx} should still be cached "
|
||||||
|
f"(have a hash), but hash is None. Only blocks from index "
|
||||||
|
f"{invalid_block_idx} onwards should be evicted."
|
||||||
|
)
|
||||||
|
|
||||||
|
# invalid blocks: verify they're not in the cached_block_hash_to_block map
|
||||||
|
cached_blocks = (
|
||||||
|
fail_scheduler.kv_cache_manager.block_pool.cached_block_hash_to_block
|
||||||
|
)
|
||||||
|
cached_block_ids = {
|
||||||
|
b.block_id
|
||||||
|
for blocks_val in cached_blocks._cache.values()
|
||||||
|
for b in (
|
||||||
|
[blocks_val] if not isinstance(blocks_val, dict) else blocks_val.values()
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
for idx in range(invalid_block_idx, len(req_block_ids)):
|
||||||
|
block_id = req_block_ids[idx]
|
||||||
|
assert block_id not in cached_block_ids, (
|
||||||
|
f"invalid block {block_id} at index {idx} should not be in cache hash table"
|
||||||
|
)
|
||||||
147
tests/v1/kv_connector/unit/test_error_propagation.py
Normal file
147
tests/v1/kv_connector/unit/test_error_propagation.py
Normal file
@ -0,0 +1,147 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
from collections.abc import Callable
|
||||||
|
from unittest.mock import Mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.v1.core.sched.scheduler import Scheduler
|
||||||
|
from vllm.v1.request import FinishReason, Request, RequestStatus
|
||||||
|
|
||||||
|
from .utils import (
|
||||||
|
create_model_runner_output,
|
||||||
|
create_request,
|
||||||
|
create_scheduler,
|
||||||
|
create_vllm_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.cpu_test
|
||||||
|
|
||||||
|
|
||||||
|
def _make_get_num_new_matched_tokens(
|
||||||
|
req_num_new_matched_tokens: dict[str, int],
|
||||||
|
async_load: bool,
|
||||||
|
) -> Callable[[Request, int], tuple[int, bool]]:
|
||||||
|
def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]:
|
||||||
|
value = req_num_new_matched_tokens.get(request.request_id, 0)
|
||||||
|
return value, async_load
|
||||||
|
|
||||||
|
return get_num_new_matched_tokens
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def fail_scheduler():
|
||||||
|
"""scheduler with kv_load_failure_policy='fail'"""
|
||||||
|
vllm_config = create_vllm_config()
|
||||||
|
vllm_config.kv_transfer_config.kv_load_failure_policy = "fail"
|
||||||
|
return create_scheduler(vllm_config)
|
||||||
|
|
||||||
|
|
||||||
|
def test_error_propagation_sync_load(fail_scheduler: Scheduler):
|
||||||
|
"""test invalid_block_ids with fail policy -> FINISHED_ERROR (sync load)"""
|
||||||
|
num_prompt_blocks = 100
|
||||||
|
num_external_computed_blocks = 99
|
||||||
|
invalid_block_idx = 50
|
||||||
|
|
||||||
|
num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size
|
||||||
|
num_external_computed_tokens = (
|
||||||
|
num_external_computed_blocks * fail_scheduler.block_size
|
||||||
|
)
|
||||||
|
|
||||||
|
request = create_request(num_tokens=num_prompt_tokens)
|
||||||
|
fail_scheduler.add_request(request=request)
|
||||||
|
|
||||||
|
req_num_new_matched_tokens = {
|
||||||
|
request.request_id: num_external_computed_tokens,
|
||||||
|
}
|
||||||
|
|
||||||
|
fail_scheduler.connector = Mock()
|
||||||
|
fail_scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||||
|
_make_get_num_new_matched_tokens(req_num_new_matched_tokens, False)
|
||||||
|
)
|
||||||
|
fail_scheduler.connector.request_finished.return_value = (False, None)
|
||||||
|
fail_scheduler.connector.take_events.return_value = ()
|
||||||
|
|
||||||
|
scheduler_output = fail_scheduler.schedule()
|
||||||
|
|
||||||
|
assert len(fail_scheduler.running) == 1
|
||||||
|
assert len(scheduler_output.scheduled_new_reqs) == 1
|
||||||
|
assert fail_scheduler.connector.get_num_new_matched_tokens.call_count == 1
|
||||||
|
|
||||||
|
req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
|
||||||
|
invalid_block_ids = {req_block_ids[invalid_block_idx]}
|
||||||
|
model_runner_output = create_model_runner_output(
|
||||||
|
[request],
|
||||||
|
invalid_block_ids=invalid_block_ids,
|
||||||
|
use_eos=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||||
|
|
||||||
|
assert request.status == RequestStatus.FINISHED_ERROR
|
||||||
|
assert request.get_finished_reason() == FinishReason.ERROR
|
||||||
|
|
||||||
|
assert len(outputs) == 1
|
||||||
|
engine_outputs = next(iter(outputs.values()))
|
||||||
|
assert len(engine_outputs.outputs) == 1
|
||||||
|
output = engine_outputs.outputs[0]
|
||||||
|
assert output.request_id == request.request_id
|
||||||
|
assert output.finish_reason == FinishReason.ERROR
|
||||||
|
|
||||||
|
assert len(fail_scheduler.running) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_error_propagation_async_load(fail_scheduler: Scheduler):
|
||||||
|
"""test invalid_block_ids with fail policy -> FINISHED_ERROR (async load)"""
|
||||||
|
num_prompt_blocks = 100
|
||||||
|
num_external_computed_blocks = 99
|
||||||
|
invalid_block_idx = 50
|
||||||
|
|
||||||
|
num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size
|
||||||
|
num_external_computed_tokens = (
|
||||||
|
num_external_computed_blocks * fail_scheduler.block_size
|
||||||
|
)
|
||||||
|
|
||||||
|
request = create_request(num_tokens=num_prompt_tokens)
|
||||||
|
fail_scheduler.add_request(request=request)
|
||||||
|
|
||||||
|
req_num_new_matched_tokens = {
|
||||||
|
request.request_id: num_external_computed_tokens,
|
||||||
|
}
|
||||||
|
|
||||||
|
fail_scheduler.connector = Mock()
|
||||||
|
fail_scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||||
|
_make_get_num_new_matched_tokens(req_num_new_matched_tokens, True)
|
||||||
|
)
|
||||||
|
fail_scheduler.connector.request_finished.return_value = (False, None)
|
||||||
|
fail_scheduler.connector.take_events.return_value = ()
|
||||||
|
|
||||||
|
scheduler_output = fail_scheduler.schedule()
|
||||||
|
|
||||||
|
assert len(fail_scheduler.waiting) == 1
|
||||||
|
assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
|
||||||
|
assert request.num_computed_tokens == 0
|
||||||
|
|
||||||
|
(req_block_ids,) = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id)
|
||||||
|
invalid_block_ids = {req_block_ids[invalid_block_idx]}
|
||||||
|
model_runner_output = create_model_runner_output(
|
||||||
|
reqs=[],
|
||||||
|
finished_recving=set(),
|
||||||
|
invalid_block_ids=invalid_block_ids,
|
||||||
|
use_eos=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||||
|
|
||||||
|
assert request.status == RequestStatus.FINISHED_ERROR
|
||||||
|
assert request.get_finished_reason() == FinishReason.ERROR
|
||||||
|
|
||||||
|
assert len(outputs) == 1
|
||||||
|
engine_outputs = next(iter(outputs.values()))
|
||||||
|
assert len(engine_outputs.outputs) == 1
|
||||||
|
output = engine_outputs.outputs[0]
|
||||||
|
assert output.request_id == request.request_id
|
||||||
|
assert output.finish_reason == FinishReason.ERROR
|
||||||
|
|
||||||
|
assert len(fail_scheduler.waiting) == 0
|
||||||
454
tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py
Normal file
454
tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py
Normal file
@ -0,0 +1,454 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
"""
|
||||||
|
Tests for correctness in invalid block handling.
|
||||||
|
|
||||||
|
These tests verify correct behavior in three scenarios:
|
||||||
|
1. Sync recompute case: Blocks should not be freed for running requests
|
||||||
|
that need to recompute invalid blocks
|
||||||
|
2. Sync fail case: Invalid blocks must be evicted from cache when request fails
|
||||||
|
3. Async recompute case: Invalid blocks should not be cached after transfer
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections.abc import Callable
|
||||||
|
from unittest.mock import Mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from vllm.v1.core.sched.scheduler import Scheduler
|
||||||
|
from vllm.v1.request import FinishReason, Request, RequestStatus
|
||||||
|
|
||||||
|
from .utils import (
|
||||||
|
create_model_runner_output,
|
||||||
|
create_request,
|
||||||
|
create_scheduler,
|
||||||
|
create_vllm_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.cpu_test
|
||||||
|
|
||||||
|
|
||||||
|
def _make_get_num_new_matched_tokens(
|
||||||
|
req_num_new_matched_tokens: dict[str, int],
|
||||||
|
async_load: bool,
|
||||||
|
) -> Callable[[Request, int], tuple[int, bool]]:
|
||||||
|
def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]:
|
||||||
|
value = req_num_new_matched_tokens.get(request.request_id, 0)
|
||||||
|
return value, async_load
|
||||||
|
|
||||||
|
return get_num_new_matched_tokens
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def fail_scheduler():
|
||||||
|
"""scheduler with kv_load_failure_policy='fail'"""
|
||||||
|
vllm_config = create_vllm_config()
|
||||||
|
vllm_config.kv_transfer_config.kv_load_failure_policy = "fail"
|
||||||
|
return create_scheduler(vllm_config)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def recompute_scheduler():
|
||||||
|
"""scheduler with kv_load_failure_policy='recompute'"""
|
||||||
|
vllm_config = create_vllm_config()
|
||||||
|
vllm_config.kv_transfer_config.kv_load_failure_policy = "recompute"
|
||||||
|
return create_scheduler(vllm_config)
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_recompute_blocks_not_freed_for_running_requests(
|
||||||
|
recompute_scheduler: Scheduler,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Test sync recompute case - blocks must not be freed for running requests.
|
||||||
|
|
||||||
|
When a running request has invalid blocks and retry_policy is 'recompute':
|
||||||
|
1. Request should remain in RUNNING state
|
||||||
|
2. num_computed_tokens should be truncated to invalid block boundary
|
||||||
|
3. Blocks should NOT be freed (request still needs them for recomputation)
|
||||||
|
4. Request should remain in scheduler.requests and scheduler.running
|
||||||
|
"""
|
||||||
|
num_prompt_blocks = 100
|
||||||
|
num_external_computed_blocks = 99
|
||||||
|
invalid_block_idx = 50
|
||||||
|
|
||||||
|
num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size
|
||||||
|
num_external_computed_tokens = (
|
||||||
|
num_external_computed_blocks * recompute_scheduler.block_size
|
||||||
|
)
|
||||||
|
|
||||||
|
request = create_request(num_tokens=num_prompt_tokens)
|
||||||
|
recompute_scheduler.add_request(request=request)
|
||||||
|
|
||||||
|
req_num_new_matched_tokens = {
|
||||||
|
request.request_id: num_external_computed_tokens,
|
||||||
|
}
|
||||||
|
|
||||||
|
# mock connector indicating sync load
|
||||||
|
recompute_scheduler.connector = Mock()
|
||||||
|
recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||||
|
_make_get_num_new_matched_tokens(req_num_new_matched_tokens, False)
|
||||||
|
)
|
||||||
|
recompute_scheduler.connector.request_finished.return_value = (False, None)
|
||||||
|
recompute_scheduler.connector.take_events.return_value = ()
|
||||||
|
|
||||||
|
scheduler_output = recompute_scheduler.schedule()
|
||||||
|
|
||||||
|
# request should be running with sync KV load
|
||||||
|
assert len(recompute_scheduler.running) == 1
|
||||||
|
assert len(scheduler_output.scheduled_new_reqs) == 1
|
||||||
|
assert request.status == RequestStatus.RUNNING
|
||||||
|
|
||||||
|
# get the allocated block IDs before invalid blocks are reported
|
||||||
|
req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
|
||||||
|
invalid_block_ids = {req_block_ids[invalid_block_idx]}
|
||||||
|
|
||||||
|
# store original num_computed_tokens for comparison
|
||||||
|
original_num_computed_tokens = request.num_computed_tokens
|
||||||
|
|
||||||
|
model_runner_output = create_model_runner_output(
|
||||||
|
[request],
|
||||||
|
invalid_block_ids=invalid_block_ids,
|
||||||
|
use_eos=False, # not finished - should continue running
|
||||||
|
)
|
||||||
|
|
||||||
|
outputs = recompute_scheduler.update_from_output(
|
||||||
|
scheduler_output, model_runner_output
|
||||||
|
)
|
||||||
|
|
||||||
|
# critical assertions for recompute case:
|
||||||
|
|
||||||
|
# 1. request should still be RUNNING (not finished, not aborted)
|
||||||
|
assert request.status == RequestStatus.RUNNING, (
|
||||||
|
f"Request should remain RUNNING for recompute, got {request.status}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. num_computed_tokens should be truncated to first invalid block
|
||||||
|
expected_truncated_tokens = invalid_block_idx * recompute_scheduler.block_size
|
||||||
|
assert request.num_computed_tokens == expected_truncated_tokens, (
|
||||||
|
f"num_computed_tokens should be truncated to {expected_truncated_tokens}, "
|
||||||
|
f"got {request.num_computed_tokens}"
|
||||||
|
)
|
||||||
|
assert request.num_computed_tokens < original_num_computed_tokens, (
|
||||||
|
"num_computed_tokens should be reduced after invalid block detection"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 3. no output should be generated (request is still running)
|
||||||
|
# the request should be skipped in the output loop
|
||||||
|
assert len(outputs) == 0 or request.request_id not in [
|
||||||
|
out.request_id for outs in outputs.values() for out in outs.outputs
|
||||||
|
], "No output should be generated for recompute requests"
|
||||||
|
|
||||||
|
# 4. request should still be in running queue
|
||||||
|
assert request in recompute_scheduler.running, (
|
||||||
|
"Request should remain in running queue for recomputation"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 5. request should still be in scheduler.requests (not deleted)
|
||||||
|
assert request.request_id in recompute_scheduler.requests, (
|
||||||
|
"Request should not be deleted from scheduler.requests"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 6. blocks should NOT be freed - verify blocks are still allocated
|
||||||
|
try:
|
||||||
|
allocated_blocks = recompute_scheduler.kv_cache_manager.get_block_ids(
|
||||||
|
request.request_id
|
||||||
|
)
|
||||||
|
assert allocated_blocks is not None
|
||||||
|
assert len(allocated_blocks[0]) > 0, (
|
||||||
|
"Blocks should still be allocated for recomputation"
|
||||||
|
)
|
||||||
|
except KeyError:
|
||||||
|
pytest.fail(
|
||||||
|
"Blocks were freed incorrectly! Running requests need their blocks "
|
||||||
|
"to recompute invalid portions."
|
||||||
|
)
|
||||||
|
|
||||||
|
# 7. verify request can be rescheduled in next step
|
||||||
|
scheduler_output_2 = recompute_scheduler.schedule()
|
||||||
|
|
||||||
|
# request should appear in the new schedule to recompute invalid blocks
|
||||||
|
scheduled_req_ids = [
|
||||||
|
req.request_id for req in scheduler_output_2.scheduled_new_reqs
|
||||||
|
]
|
||||||
|
if scheduler_output_2.num_scheduled_tokens:
|
||||||
|
scheduled_req_ids.extend(scheduler_output_2.num_scheduled_tokens.keys())
|
||||||
|
|
||||||
|
assert (
|
||||||
|
request.request_id in scheduled_req_ids or len(recompute_scheduler.running) > 0
|
||||||
|
), "Request should be reschedulable for recomputation"
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_fail_invalid_blocks_evicted(fail_scheduler: Scheduler):
|
||||||
|
"""
|
||||||
|
Test sync fail case - invalid blocks must be evicted from cache.
|
||||||
|
|
||||||
|
When a request fails with policy='fail' and has invalid blocks from sync loading:
|
||||||
|
1. Request should be finished with FINISHED_ERROR
|
||||||
|
2. Invalid blocks should be evicted from the KV cache
|
||||||
|
3. Valid blocks (if shared) should remain in cache
|
||||||
|
4. Future requests should not reuse the invalid blocks
|
||||||
|
|
||||||
|
This test verifies that invalid blocks are properly evicted to prevent
|
||||||
|
cache corruption and reuse of invalid data.
|
||||||
|
"""
|
||||||
|
num_prompt_blocks = 100
|
||||||
|
num_external_computed_blocks = 99
|
||||||
|
invalid_block_idx = 50
|
||||||
|
|
||||||
|
num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size
|
||||||
|
num_external_computed_tokens = (
|
||||||
|
num_external_computed_blocks * fail_scheduler.block_size
|
||||||
|
)
|
||||||
|
|
||||||
|
request = create_request(num_tokens=num_prompt_tokens)
|
||||||
|
fail_scheduler.add_request(request=request)
|
||||||
|
|
||||||
|
req_num_new_matched_tokens = {
|
||||||
|
request.request_id: num_external_computed_tokens,
|
||||||
|
}
|
||||||
|
|
||||||
|
# mock connector indicating sync load
|
||||||
|
fail_scheduler.connector = Mock()
|
||||||
|
fail_scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||||
|
_make_get_num_new_matched_tokens(req_num_new_matched_tokens, False)
|
||||||
|
)
|
||||||
|
fail_scheduler.connector.request_finished.return_value = (False, None)
|
||||||
|
fail_scheduler.connector.take_events.return_value = ()
|
||||||
|
|
||||||
|
scheduler_output = fail_scheduler.schedule()
|
||||||
|
|
||||||
|
# request should be running with sync KV load
|
||||||
|
assert len(fail_scheduler.running) == 1
|
||||||
|
assert request.status == RequestStatus.RUNNING
|
||||||
|
|
||||||
|
# get allocated block IDs
|
||||||
|
req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
|
||||||
|
invalid_block_id = req_block_ids[invalid_block_idx]
|
||||||
|
invalid_block_ids = {invalid_block_id}
|
||||||
|
|
||||||
|
# verify the block is in the block pool before we report it as invalid
|
||||||
|
block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id]
|
||||||
|
assert block is not None
|
||||||
|
|
||||||
|
# report invalid blocks - request should fail
|
||||||
|
model_runner_output = create_model_runner_output(
|
||||||
|
[request],
|
||||||
|
invalid_block_ids=invalid_block_ids,
|
||||||
|
use_eos=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||||
|
|
||||||
|
# verify request is finished with error
|
||||||
|
assert request.status == RequestStatus.FINISHED_ERROR
|
||||||
|
assert request.get_finished_reason() == FinishReason.ERROR
|
||||||
|
|
||||||
|
# verify output is generated
|
||||||
|
assert len(outputs) == 1
|
||||||
|
engine_outputs = next(iter(outputs.values()))
|
||||||
|
assert len(engine_outputs.outputs) == 1
|
||||||
|
output = engine_outputs.outputs[0]
|
||||||
|
assert output.request_id == request.request_id
|
||||||
|
assert output.finish_reason == FinishReason.ERROR
|
||||||
|
|
||||||
|
# verify the request was removed from scheduler
|
||||||
|
assert request.request_id not in fail_scheduler.requests
|
||||||
|
assert len(fail_scheduler.running) == 0
|
||||||
|
|
||||||
|
# critical: verify invalid block was actually freed from cache
|
||||||
|
# this is the key assertion - the invalid block should no longer be
|
||||||
|
# tracked by the KV cache manager for this request
|
||||||
|
# if it's still there, a future request could reuse the invalid data
|
||||||
|
try:
|
||||||
|
block_ids = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id)
|
||||||
|
# if we get here, check if blocks were actually freed
|
||||||
|
if block_ids is not None and len(block_ids[0]) > 0:
|
||||||
|
pytest.fail(
|
||||||
|
f"Invalid blocks still tracked for finished request! "
|
||||||
|
f"Request {request.request_id} should have been freed but "
|
||||||
|
f"still has {len(block_ids[0])} blocks allocated."
|
||||||
|
)
|
||||||
|
# blocks list exists but is empty - this is fine, they were freed
|
||||||
|
except KeyError:
|
||||||
|
# expected - request completely removed from tracking
|
||||||
|
pass
|
||||||
|
|
||||||
|
# critical: verify invalid block was evicted from prefix cache
|
||||||
|
# the block should no longer have a hash (hash is reset on eviction)
|
||||||
|
assert block.block_hash is None, (
|
||||||
|
f"Invalid block {invalid_block_id} should have been evicted from cache "
|
||||||
|
f"(hash should be None), but hash is still {block.block_hash}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_async_recompute_blocks_not_cached_when_invalid(
|
||||||
|
recompute_scheduler: Scheduler,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Test async recompute case - invalid blocks not cached after transfer.
|
||||||
|
|
||||||
|
When async KV loading has invalid blocks and retry_policy is 'recompute':
|
||||||
|
1. Blocks are allocated but not cached yet
|
||||||
|
2. When async transfer completes, only valid blocks should be cached
|
||||||
|
3. Invalid blocks should never enter the prefix cache
|
||||||
|
|
||||||
|
This test verifies correctness, the failed_recving_kv_req_ids protection
|
||||||
|
ensures only valid blocks are cached when the transfer completes, and we
|
||||||
|
only evict blocks from cache that are already hashed in the block table.
|
||||||
|
"""
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
num_prompt_blocks = 100
|
||||||
|
num_external_computed_blocks = 99
|
||||||
|
invalid_block_idx = 50
|
||||||
|
|
||||||
|
num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size
|
||||||
|
num_external_computed_tokens = (
|
||||||
|
num_external_computed_blocks * recompute_scheduler.block_size
|
||||||
|
)
|
||||||
|
|
||||||
|
request = create_request(num_tokens=num_prompt_tokens)
|
||||||
|
recompute_scheduler.add_request(request=request)
|
||||||
|
|
||||||
|
req_num_new_matched_tokens = {
|
||||||
|
request.request_id: num_external_computed_tokens,
|
||||||
|
}
|
||||||
|
|
||||||
|
# mock connector indicating async load
|
||||||
|
recompute_scheduler.connector = Mock()
|
||||||
|
recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = (
|
||||||
|
_make_get_num_new_matched_tokens(req_num_new_matched_tokens, True)
|
||||||
|
)
|
||||||
|
recompute_scheduler.connector.request_finished.return_value = (False, None)
|
||||||
|
recompute_scheduler.connector.take_events.return_value = ()
|
||||||
|
|
||||||
|
scheduler_output = recompute_scheduler.schedule()
|
||||||
|
|
||||||
|
# request should be waiting for remote KVs
|
||||||
|
assert len(recompute_scheduler.waiting) == 1
|
||||||
|
assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
|
||||||
|
assert request.num_computed_tokens == 0
|
||||||
|
|
||||||
|
# get the allocated block IDs
|
||||||
|
(req_block_ids,) = recompute_scheduler.kv_cache_manager.get_block_ids(
|
||||||
|
request.request_id
|
||||||
|
)
|
||||||
|
invalid_block_id = req_block_ids[invalid_block_idx]
|
||||||
|
invalid_block_ids = {invalid_block_id}
|
||||||
|
|
||||||
|
# get the block object to verify it's not cached yet and stays uncached
|
||||||
|
block = recompute_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id]
|
||||||
|
|
||||||
|
# verify block has no hash before invalid blocks are reported
|
||||||
|
assert block.block_hash is None, (
|
||||||
|
"Async loading blocks should not be cached yet (no hash)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# report invalid blocks (transfer not finished yet)
|
||||||
|
model_runner_output = create_model_runner_output(
|
||||||
|
reqs=[],
|
||||||
|
finished_recving=None, # transfer NOT finished
|
||||||
|
invalid_block_ids=invalid_block_ids,
|
||||||
|
use_eos=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# critical: spy on evict_blocks to verify it's NOT called for async blocks
|
||||||
|
original_evict_blocks = recompute_scheduler.kv_cache_manager.evict_blocks
|
||||||
|
evict_blocks_calls = []
|
||||||
|
|
||||||
|
def evict_blocks_spy(block_ids):
|
||||||
|
evict_blocks_calls.append(set(block_ids))
|
||||||
|
return original_evict_blocks(block_ids)
|
||||||
|
|
||||||
|
with patch.object(
|
||||||
|
recompute_scheduler.kv_cache_manager, "evict_blocks", evict_blocks_spy
|
||||||
|
):
|
||||||
|
recompute_scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||||
|
|
||||||
|
# verify evict_blocks was NOT called (async blocks excluded from eviction)
|
||||||
|
assert len(evict_blocks_calls) == 0, (
|
||||||
|
f"evict_blocks should not be called for async-only invalid blocks, "
|
||||||
|
f"but was called {len(evict_blocks_calls)} time(s) with {evict_blocks_calls}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# request should still be waiting (not finished with error due to recompute policy)
|
||||||
|
assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
|
||||||
|
assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids
|
||||||
|
|
||||||
|
# verify num_computed_tokens was truncated to before invalid block
|
||||||
|
expected_valid_tokens = invalid_block_idx * recompute_scheduler.block_size
|
||||||
|
assert request.num_computed_tokens == expected_valid_tokens
|
||||||
|
|
||||||
|
# verify invalid block still has no hash (was not evicted)
|
||||||
|
assert block.block_hash is None, (
|
||||||
|
f"Async loading blocks shouldn't be cached or evicted. "
|
||||||
|
f"Block {invalid_block_id} hash should be None but is {block.block_hash}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# now simulate async transfer completing
|
||||||
|
model_runner_output_2 = create_model_runner_output(
|
||||||
|
reqs=[],
|
||||||
|
finished_recving={request.request_id},
|
||||||
|
invalid_block_ids=None,
|
||||||
|
use_eos=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
recompute_scheduler.update_from_output(scheduler_output, model_runner_output_2)
|
||||||
|
|
||||||
|
# verify request is now marked as finished receiving and ready to be processed
|
||||||
|
assert request.request_id in recompute_scheduler.finished_recving_kv_req_ids
|
||||||
|
assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids
|
||||||
|
|
||||||
|
# critical: verify invalid block still has no hash before recompute
|
||||||
|
# the async transfer invalid data was never cached
|
||||||
|
assert block.block_hash is None, (
|
||||||
|
f"Invalid block {invalid_block_id} should not be cached before recompute "
|
||||||
|
f"(hash should be None), but hash is {block.block_hash}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# critical end-to-end test: spy on cache_blocks to verify it's called with
|
||||||
|
# the truncated num_computed_tokens value
|
||||||
|
original_cache_blocks = recompute_scheduler.kv_cache_manager.cache_blocks
|
||||||
|
cache_blocks_calls = []
|
||||||
|
|
||||||
|
def cache_blocks_spy(req, num_tokens):
|
||||||
|
cache_blocks_calls.append((req.request_id, num_tokens))
|
||||||
|
return original_cache_blocks(req, num_tokens)
|
||||||
|
|
||||||
|
with patch.object(
|
||||||
|
recompute_scheduler.kv_cache_manager, "cache_blocks", cache_blocks_spy
|
||||||
|
):
|
||||||
|
# call schedule() again - this triggers _update_waiting_for_remote_kv()
|
||||||
|
# which should call cache_blocks with the truncated value
|
||||||
|
recompute_scheduler.schedule()
|
||||||
|
|
||||||
|
# verify cache_blocks was called with the truncated value
|
||||||
|
assert len(cache_blocks_calls) == 1, (
|
||||||
|
f"cache_blocks should be called exactly once, "
|
||||||
|
f"got {len(cache_blocks_calls)} calls"
|
||||||
|
)
|
||||||
|
cached_req_id, cached_num_tokens = cache_blocks_calls[0]
|
||||||
|
assert cached_req_id == request.request_id
|
||||||
|
assert cached_num_tokens == expected_valid_tokens, (
|
||||||
|
f"cache_blocks should be called with truncated value {expected_valid_tokens}, "
|
||||||
|
f"but was called with {cached_num_tokens}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# request should now be RUNNING (scheduled immediately after transfer completes)
|
||||||
|
# the flow is: WAITING_FOR_REMOTE_KVS -> WAITING -> RUNNING in same schedule() call
|
||||||
|
assert request.status == RequestStatus.RUNNING
|
||||||
|
|
||||||
|
# num_computed_tokens should be >= expected_valid_tokens because the scheduler
|
||||||
|
# will schedule additional new tokens (up to max_num_batched_tokens) for the request
|
||||||
|
assert request.num_computed_tokens >= expected_valid_tokens, (
|
||||||
|
f"num_computed_tokens should be at least {expected_valid_tokens}, "
|
||||||
|
f"got {request.num_computed_tokens}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# request should no longer be in the failed/finished receiving sets
|
||||||
|
assert request.request_id not in recompute_scheduler.failed_recving_kv_req_ids
|
||||||
|
assert request.request_id not in recompute_scheduler.finished_recving_kv_req_ids
|
||||||
|
|
||||||
|
# request should be in the running queue
|
||||||
|
assert request in recompute_scheduler.running
|
||||||
@ -7,7 +7,7 @@ Here we break down the requirements in 2 steps:
|
|||||||
1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
|
1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
|
||||||
2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
|
2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
|
||||||
|
|
||||||
2 is necessary for multi-node deployment.
|
Step 2 is necessary for multi-node deployment.
|
||||||
|
|
||||||
All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.
|
All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.
|
||||||
|
|
||||||
@ -23,6 +23,6 @@ TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh
|
|||||||
Additional step for multi-node deployment:
|
Additional step for multi-node deployment:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo bash configure_system_drivers.sh
|
sudo bash configure_system_drivers.sh # update-initramfs can take several minutes
|
||||||
sudo reboot # Reboot is required to load the new driver
|
sudo reboot # Reboot is required to load the new driver
|
||||||
```
|
```
|
||||||
|
|||||||
@ -788,7 +788,7 @@ async def benchmark(
|
|||||||
)
|
)
|
||||||
print(
|
print(
|
||||||
"{:<40} {:<10.2f}".format(
|
"{:<40} {:<10.2f}".format(
|
||||||
"Total Token throughput (tok/s):", metrics.total_token_throughput
|
"Total token throughput (tok/s):", metrics.total_token_throughput
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -64,6 +64,11 @@ class KVTransferConfig:
|
|||||||
enable_permute_local_kv: bool = False
|
enable_permute_local_kv: bool = False
|
||||||
"""Experiment feature flag to enable HND to NHD KV Transfer"""
|
"""Experiment feature flag to enable HND to NHD KV Transfer"""
|
||||||
|
|
||||||
|
kv_load_failure_policy: Literal["recompute", "fail"] = "recompute"
|
||||||
|
"""Policy for handling KV cache load failures.
|
||||||
|
'recompute': reschedule the request to recompute failed blocks (default)
|
||||||
|
'fail': immediately fail the request with an error finish reason"""
|
||||||
|
|
||||||
def compute_hash(self) -> str:
|
def compute_hash(self) -> str:
|
||||||
"""
|
"""
|
||||||
WARNING: Whenever a new field is added to this config,
|
WARNING: Whenever a new field is added to this config,
|
||||||
|
|||||||
@ -666,8 +666,9 @@ class VllmConfig:
|
|||||||
|
|
||||||
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
|
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
|
||||||
self._apply_optimization_level_defaults(default_config)
|
self._apply_optimization_level_defaults(default_config)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
|
self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
|
||||||
and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
|
and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
|
||||||
):
|
):
|
||||||
logger.info(
|
logger.info(
|
||||||
@ -692,22 +693,29 @@ class VllmConfig:
|
|||||||
|
|
||||||
if current_platform.support_static_graph_mode():
|
if current_platform.support_static_graph_mode():
|
||||||
# if cudagraph_mode has full cudagraphs, we need to check support
|
# if cudagraph_mode has full cudagraphs, we need to check support
|
||||||
if (
|
if model_config := self.model_config:
|
||||||
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
if (
|
||||||
and self.model_config is not None
|
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
|
||||||
):
|
and model_config.pooler_config is not None
|
||||||
if self.model_config.pooler_config is not None:
|
):
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"Pooling models do not support full cudagraphs. "
|
"Pooling models do not support full cudagraphs. "
|
||||||
"Overriding cudagraph_mode to PIECEWISE."
|
"Overriding cudagraph_mode to PIECEWISE."
|
||||||
)
|
)
|
||||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||||
elif self.model_config.is_encoder_decoder:
|
elif (
|
||||||
logger.warning_once(
|
model_config.is_encoder_decoder
|
||||||
"Encoder-decoder models do not support full cudagraphs. "
|
and self.compilation_config.cudagraph_mode
|
||||||
"Overriding cudagraph_mode to PIECEWISE."
|
not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
|
||||||
|
):
|
||||||
|
logger.info_once(
|
||||||
|
"Encoder-decoder models do not support %s. "
|
||||||
|
"Overriding cudagraph_mode to FULL_DECODE_ONLY.",
|
||||||
|
self.compilation_config.cudagraph_mode.name,
|
||||||
|
)
|
||||||
|
self.compilation_config.cudagraph_mode = (
|
||||||
|
CUDAGraphMode.FULL_DECODE_ONLY
|
||||||
)
|
)
|
||||||
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
|
||||||
|
|
||||||
# disable cudagraph when enforce eager execution
|
# disable cudagraph when enforce eager execution
|
||||||
if self.model_config is not None and self.model_config.enforce_eager:
|
if self.model_config is not None and self.model_config.enforce_eager:
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
import functools
|
import functools
|
||||||
import pickle
|
import pickle
|
||||||
|
import threading
|
||||||
import time
|
import time
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
@ -43,6 +44,33 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
|
|||||||
from_bytes_big = functools.partial(int.from_bytes, byteorder="big")
|
from_bytes_big = functools.partial(int.from_bytes, byteorder="big")
|
||||||
|
|
||||||
|
|
||||||
|
# Memory fence for cross-process shared memory visibility.
|
||||||
|
# Required for correct producer-consumer synchronization when using
|
||||||
|
# shared memory without locks.
|
||||||
|
_memory_fence_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def memory_fence():
|
||||||
|
"""
|
||||||
|
Full memory barrier for shared memory synchronization.
|
||||||
|
|
||||||
|
Ensures all prior memory writes are visible to other processes before
|
||||||
|
any subsequent reads. This is critical for lock-free producer-consumer
|
||||||
|
patterns using shared memory.
|
||||||
|
|
||||||
|
Implementation acquires and immediately releases a lock. Python's
|
||||||
|
threading.Lock provides sequentially consistent memory barrier semantics
|
||||||
|
across all major platforms (POSIX, Windows). This is a lightweight
|
||||||
|
operation (~20ns) that guarantees:
|
||||||
|
- All stores before the barrier are visible to other threads/processes
|
||||||
|
- All loads after the barrier see the latest values
|
||||||
|
"""
|
||||||
|
# Lock acquire/release provides full memory barrier semantics.
|
||||||
|
# Using context manager ensures lock release even on exceptions.
|
||||||
|
with _memory_fence_lock:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def to_bytes_big(value: int, size: int) -> bytes:
|
def to_bytes_big(value: int, size: int) -> bytes:
|
||||||
return value.to_bytes(size, byteorder="big")
|
return value.to_bytes(size, byteorder="big")
|
||||||
|
|
||||||
@ -414,6 +442,10 @@ class MessageQueue:
|
|||||||
n_warning = 1
|
n_warning = 1
|
||||||
while True:
|
while True:
|
||||||
with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
|
with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
|
||||||
|
# Memory fence ensures we see the latest read flags from readers.
|
||||||
|
# Without this, we may read stale flags from our CPU cache and
|
||||||
|
# spin indefinitely even though readers have completed.
|
||||||
|
memory_fence()
|
||||||
read_count = sum(metadata_buffer[1:])
|
read_count = sum(metadata_buffer[1:])
|
||||||
written_flag = metadata_buffer[0]
|
written_flag = metadata_buffer[0]
|
||||||
if written_flag and read_count != self.buffer.n_reader:
|
if written_flag and read_count != self.buffer.n_reader:
|
||||||
@ -458,6 +490,10 @@ class MessageQueue:
|
|||||||
metadata_buffer[i] = 0
|
metadata_buffer[i] = 0
|
||||||
# mark the block as written
|
# mark the block as written
|
||||||
metadata_buffer[0] = 1
|
metadata_buffer[0] = 1
|
||||||
|
# Memory fence ensures the write is visible to readers on other cores
|
||||||
|
# before we proceed. Without this, readers may spin indefinitely
|
||||||
|
# waiting for a write that's stuck in our CPU's store buffer.
|
||||||
|
memory_fence()
|
||||||
self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
|
self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
|
||||||
break
|
break
|
||||||
|
|
||||||
@ -473,6 +509,10 @@ class MessageQueue:
|
|||||||
n_warning = 1
|
n_warning = 1
|
||||||
while True:
|
while True:
|
||||||
with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
|
with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
|
||||||
|
# Memory fence ensures we see the latest writes from the writer.
|
||||||
|
# Without this, we may read stale flags from our CPU cache
|
||||||
|
# and spin indefinitely even though writer has updated them.
|
||||||
|
memory_fence()
|
||||||
read_flag = metadata_buffer[self.local_reader_rank + 1]
|
read_flag = metadata_buffer[self.local_reader_rank + 1]
|
||||||
written_flag = metadata_buffer[0]
|
written_flag = metadata_buffer[0]
|
||||||
if not written_flag or read_flag:
|
if not written_flag or read_flag:
|
||||||
@ -513,6 +553,10 @@ class MessageQueue:
|
|||||||
# caller has read from the buffer
|
# caller has read from the buffer
|
||||||
# set the read flag
|
# set the read flag
|
||||||
metadata_buffer[self.local_reader_rank + 1] = 1
|
metadata_buffer[self.local_reader_rank + 1] = 1
|
||||||
|
# Memory fence ensures the read flag is visible to the writer.
|
||||||
|
# Without this, writer may not see our read completion and
|
||||||
|
# could wait indefinitely for all readers to finish.
|
||||||
|
memory_fence()
|
||||||
self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
|
self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
|
||||||
|
|
||||||
self._read_spin_timer.record_activity()
|
self._read_spin_timer.record_activity()
|
||||||
|
|||||||
@ -322,6 +322,9 @@ async def transfer_layer(
|
|||||||
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
|
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
|
||||||
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
|
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
|
||||||
assert num_physical_experts == ep_size * num_local_physical_experts
|
assert num_physical_experts == ep_size * num_local_physical_experts
|
||||||
|
# A buffer to hold the expert weights in one layer during the exchange.
|
||||||
|
# NOTE: Currently we assume the same weights across different layers
|
||||||
|
# have the same shape.
|
||||||
|
|
||||||
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
|
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
|
||||||
num_local_experts=num_local_physical_experts,
|
num_local_experts=num_local_physical_experts,
|
||||||
|
|||||||
@ -27,7 +27,7 @@ from lmcache.v1.lookup_client.lmcache_async_lookup_client import (
|
|||||||
LMCacheAsyncLookupServer,
|
LMCacheAsyncLookupServer,
|
||||||
)
|
)
|
||||||
from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer
|
from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer
|
||||||
from lmcache.v1.plugin.plugin_launcher import PluginLauncher
|
from lmcache.v1.plugin.runtime_plugin_launcher import RuntimePluginLauncher
|
||||||
|
|
||||||
from vllm.attention.backends.abstract import AttentionMetadata
|
from vllm.attention.backends.abstract import AttentionMetadata
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig
|
||||||
@ -683,7 +683,7 @@ class LMCacheConnectorV1Impl:
|
|||||||
self.api_server = InternalAPIServer(self)
|
self.api_server = InternalAPIServer(self)
|
||||||
self.api_server.start()
|
self.api_server.start()
|
||||||
# Launch plugins
|
# Launch plugins
|
||||||
self.plugin_launcher = PluginLauncher(
|
self.plugin_launcher = RuntimePluginLauncher(
|
||||||
self.config,
|
self.config,
|
||||||
role,
|
role,
|
||||||
self.worker_count,
|
self.worker_count,
|
||||||
|
|||||||
@ -1586,6 +1586,8 @@ def destroy_distributed_environment():
|
|||||||
|
|
||||||
|
|
||||||
def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
|
def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
|
||||||
|
# Reset environment variable cache
|
||||||
|
envs.disable_envs_cache()
|
||||||
# Ensure all objects are not frozen before cleanup
|
# Ensure all objects are not frozen before cleanup
|
||||||
gc.unfreeze()
|
gc.unfreeze()
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,7 @@ import cloudpickle
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
from typing_extensions import TypeVar, deprecated
|
from typing_extensions import TypeVar
|
||||||
|
|
||||||
from vllm.beam_search import (
|
from vllm.beam_search import (
|
||||||
BeamSearchInstance,
|
BeamSearchInstance,
|
||||||
@ -73,7 +73,6 @@ from vllm.pooling_params import PoolingParams
|
|||||||
from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams
|
from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams
|
||||||
from vllm.tasks import PoolingTask
|
from vllm.tasks import PoolingTask
|
||||||
from vllm.tokenizers import MistralTokenizer, TokenizerLike
|
from vllm.tokenizers import MistralTokenizer, TokenizerLike
|
||||||
from vllm.tokenizers.hf import get_cached_tokenizer
|
|
||||||
from vllm.usage.usage_lib import UsageContext
|
from vllm.usage.usage_lib import UsageContext
|
||||||
from vllm.utils.collection_utils import as_iter, is_list_of
|
from vllm.utils.collection_utils import as_iter, is_list_of
|
||||||
from vllm.utils.counter import Counter
|
from vllm.utils.counter import Counter
|
||||||
@ -367,16 +366,6 @@ class LLM:
|
|||||||
def get_tokenizer(self) -> TokenizerLike:
|
def get_tokenizer(self) -> TokenizerLike:
|
||||||
return self.llm_engine.get_tokenizer()
|
return self.llm_engine.get_tokenizer()
|
||||||
|
|
||||||
@deprecated("`set_tokenizer` is deprecated and will be removed in v0.13.")
|
|
||||||
def set_tokenizer(self, tokenizer: TokenizerLike) -> None:
|
|
||||||
# While CachedTokenizer is dynamic, have no choice but
|
|
||||||
# compare class name. Misjudgment will arise from
|
|
||||||
# user-defined tokenizer started with 'Cached'
|
|
||||||
if tokenizer.__class__.__name__.startswith("Cached"):
|
|
||||||
self.llm_engine.tokenizer = tokenizer
|
|
||||||
else:
|
|
||||||
self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
|
|
||||||
|
|
||||||
def reset_mm_cache(self) -> None:
|
def reset_mm_cache(self) -> None:
|
||||||
self.input_processor.clear_mm_cache()
|
self.input_processor.clear_mm_cache()
|
||||||
self.llm_engine.reset_mm_cache()
|
self.llm_engine.reset_mm_cache()
|
||||||
|
|||||||
@ -51,7 +51,11 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
ToolCall,
|
ToolCall,
|
||||||
UsageInfo,
|
UsageInfo,
|
||||||
)
|
)
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
|
from vllm.entrypoints.openai.serving_engine import (
|
||||||
|
GenerationError,
|
||||||
|
OpenAIServing,
|
||||||
|
clamp_prompt_logprobs,
|
||||||
|
)
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.openai.tool_parsers import ToolParser
|
from vllm.entrypoints.openai.tool_parsers import ToolParser
|
||||||
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
|
from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
|
||||||
@ -380,6 +384,8 @@ class OpenAIServingChat(OpenAIServing):
|
|||||||
tokenizer,
|
tokenizer,
|
||||||
request_metadata,
|
request_metadata,
|
||||||
)
|
)
|
||||||
|
except GenerationError as e:
|
||||||
|
return self._convert_generation_error_to_response(e)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
# TODO: Use a vllm-specific Validation Error
|
# TODO: Use a vllm-specific Validation Error
|
||||||
return self.create_error_response(str(e))
|
return self.create_error_response(str(e))
|
||||||
@ -1120,6 +1126,10 @@ class OpenAIServingChat(OpenAIServing):
|
|||||||
|
|
||||||
# if the model is finished generating
|
# if the model is finished generating
|
||||||
else:
|
else:
|
||||||
|
# check for error finish reason and abort streaming
|
||||||
|
# finish_reason='error' indicates a retryable error
|
||||||
|
self._raise_if_error(output.finish_reason, request_id)
|
||||||
|
|
||||||
# check to make sure we haven't "forgotten" to stream
|
# check to make sure we haven't "forgotten" to stream
|
||||||
# any tokens that were generated but previously
|
# any tokens that were generated but previously
|
||||||
# matched by partial json parsing
|
# matched by partial json parsing
|
||||||
@ -1287,6 +1297,8 @@ class OpenAIServingChat(OpenAIServing):
|
|||||||
delta=False,
|
delta=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
except GenerationError as e:
|
||||||
|
yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# TODO: Use a vllm-specific Validation Error
|
# TODO: Use a vllm-specific Validation Error
|
||||||
logger.exception("Error in chat completion stream generator.")
|
logger.exception("Error in chat completion stream generator.")
|
||||||
@ -1327,6 +1339,9 @@ class OpenAIServingChat(OpenAIServing):
|
|||||||
|
|
||||||
role = self.get_chat_request_role(request)
|
role = self.get_chat_request_role(request)
|
||||||
for output in final_res.outputs:
|
for output in final_res.outputs:
|
||||||
|
# check for error finish reason and raise GenerationError
|
||||||
|
# finish_reason='error' indicates a retryable request-level internal error
|
||||||
|
self._raise_if_error(output.finish_reason, request_id)
|
||||||
token_ids = output.token_ids
|
token_ids = output.token_ids
|
||||||
out_logprobs = output.logprobs
|
out_logprobs = output.logprobs
|
||||||
tool_call_info = None
|
tool_call_info = None
|
||||||
|
|||||||
@ -24,7 +24,11 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
RequestResponseMetadata,
|
RequestResponseMetadata,
|
||||||
UsageInfo,
|
UsageInfo,
|
||||||
)
|
)
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
|
from vllm.entrypoints.openai.serving_engine import (
|
||||||
|
GenerationError,
|
||||||
|
OpenAIServing,
|
||||||
|
clamp_prompt_logprobs,
|
||||||
|
)
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.renderer import RenderConfig
|
from vllm.entrypoints.renderer import RenderConfig
|
||||||
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
|
from vllm.entrypoints.utils import get_max_tokens, should_include_usage
|
||||||
@ -300,6 +304,8 @@ class OpenAIServingCompletion(OpenAIServing):
|
|||||||
)
|
)
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
return self.create_error_response("Client disconnected")
|
return self.create_error_response("Client disconnected")
|
||||||
|
except GenerationError as e:
|
||||||
|
return self._convert_generation_error_to_response(e)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
# TODO: Use a vllm-specific Validation Error
|
# TODO: Use a vllm-specific Validation Error
|
||||||
return self.create_error_response(str(e))
|
return self.create_error_response(str(e))
|
||||||
@ -437,6 +443,8 @@ class OpenAIServingCompletion(OpenAIServing):
|
|||||||
finish_reason = output.finish_reason
|
finish_reason = output.finish_reason
|
||||||
stop_reason = output.stop_reason
|
stop_reason = output.stop_reason
|
||||||
|
|
||||||
|
self._raise_if_error(finish_reason, request_id)
|
||||||
|
|
||||||
chunk = CompletionStreamResponse(
|
chunk = CompletionStreamResponse(
|
||||||
id=request_id,
|
id=request_id,
|
||||||
created=created_time,
|
created=created_time,
|
||||||
@ -498,8 +506,11 @@ class OpenAIServingCompletion(OpenAIServing):
|
|||||||
# report to FastAPI middleware aggregate usage across all choices
|
# report to FastAPI middleware aggregate usage across all choices
|
||||||
request_metadata.final_usage_info = final_usage_info
|
request_metadata.final_usage_info = final_usage_info
|
||||||
|
|
||||||
|
except GenerationError as e:
|
||||||
|
yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# TODO: Use a vllm-specific Validation Error
|
# TODO: Use a vllm-specific Validation Error
|
||||||
|
logger.exception("Error in completion stream generator.")
|
||||||
data = self.create_streaming_error_response(str(e))
|
data = self.create_streaming_error_response(str(e))
|
||||||
yield f"data: {data}\n\n"
|
yield f"data: {data}\n\n"
|
||||||
yield "data: [DONE]\n\n"
|
yield "data: [DONE]\n\n"
|
||||||
@ -530,6 +541,8 @@ class OpenAIServingCompletion(OpenAIServing):
|
|||||||
out_logprobs: GenericSequence[dict[int, Logprob] | None] | None
|
out_logprobs: GenericSequence[dict[int, Logprob] | None] | None
|
||||||
|
|
||||||
for output in final_res.outputs:
|
for output in final_res.outputs:
|
||||||
|
self._raise_if_error(output.finish_reason, request_id)
|
||||||
|
|
||||||
assert request.max_tokens is not None
|
assert request.max_tokens is not None
|
||||||
if request.echo:
|
if request.echo:
|
||||||
if request.return_token_ids:
|
if request.return_token_ids:
|
||||||
|
|||||||
@ -133,6 +133,15 @@ from vllm.utils.async_utils import (
|
|||||||
from vllm.utils.collection_utils import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.v1.engine import EngineCoreRequest
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
|
|
||||||
|
|
||||||
|
class GenerationError(Exception):
|
||||||
|
"""raised when finish_reason indicates internal server error (500)"""
|
||||||
|
|
||||||
|
def __init__(self, message: str = "Internal server error"):
|
||||||
|
super().__init__(message)
|
||||||
|
self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR
|
||||||
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
CompletionLikeRequest: TypeAlias = (
|
CompletionLikeRequest: TypeAlias = (
|
||||||
@ -456,6 +465,29 @@ class OpenAIServing:
|
|||||||
# Iterate through all beam inference results
|
# Iterate through all beam inference results
|
||||||
for i, result in enumerate(output):
|
for i, result in enumerate(output):
|
||||||
current_beam = all_beams[i]
|
current_beam = all_beams[i]
|
||||||
|
|
||||||
|
# check for error finish reason and abort beam search
|
||||||
|
if result.outputs[0].finish_reason == "error":
|
||||||
|
# yield error output and terminate beam search
|
||||||
|
yield RequestOutput(
|
||||||
|
request_id=request_id,
|
||||||
|
prompt=prompt_text,
|
||||||
|
outputs=[
|
||||||
|
CompletionOutput(
|
||||||
|
index=0,
|
||||||
|
text="",
|
||||||
|
token_ids=[],
|
||||||
|
cumulative_logprob=None,
|
||||||
|
logprobs=None,
|
||||||
|
finish_reason="error",
|
||||||
|
)
|
||||||
|
],
|
||||||
|
finished=True,
|
||||||
|
prompt_token_ids=prompt_token_ids,
|
||||||
|
prompt_logprobs=None,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
if result.outputs[0].logprobs is not None:
|
if result.outputs[0].logprobs is not None:
|
||||||
logprobs = result.outputs[0].logprobs[0]
|
logprobs = result.outputs[0].logprobs[0]
|
||||||
all_beams_token_id.extend(list(logprobs.keys()))
|
all_beams_token_id.extend(list(logprobs.keys()))
|
||||||
@ -780,6 +812,35 @@ class OpenAIServing:
|
|||||||
)
|
)
|
||||||
return json_str
|
return json_str
|
||||||
|
|
||||||
|
def _raise_if_error(self, finish_reason: str | None, request_id: str) -> None:
|
||||||
|
"""Raise GenerationError if finish_reason indicates an error."""
|
||||||
|
if finish_reason == "error":
|
||||||
|
logger.error(
|
||||||
|
"Request %s failed with an internal error during generation",
|
||||||
|
request_id,
|
||||||
|
)
|
||||||
|
raise GenerationError("Internal server error")
|
||||||
|
|
||||||
|
def _convert_generation_error_to_response(
|
||||||
|
self, e: GenerationError
|
||||||
|
) -> ErrorResponse:
|
||||||
|
"""Convert GenerationError to ErrorResponse."""
|
||||||
|
return self.create_error_response(
|
||||||
|
str(e),
|
||||||
|
err_type="InternalServerError",
|
||||||
|
status_code=e.status_code,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _convert_generation_error_to_streaming_response(
|
||||||
|
self, e: GenerationError
|
||||||
|
) -> str:
|
||||||
|
"""Convert GenerationError to streaming error response."""
|
||||||
|
return self.create_streaming_error_response(
|
||||||
|
str(e),
|
||||||
|
err_type="InternalServerError",
|
||||||
|
status_code=e.status_code,
|
||||||
|
)
|
||||||
|
|
||||||
async def _check_model(
|
async def _check_model(
|
||||||
self,
|
self,
|
||||||
request: AnyRequest,
|
request: AnyRequest,
|
||||||
|
|||||||
@ -50,6 +50,7 @@ from openai.types.responses.response_reasoning_item import (
|
|||||||
)
|
)
|
||||||
from openai.types.responses.tool import Mcp, Tool
|
from openai.types.responses.tool import Mcp, Tool
|
||||||
from openai_harmony import Message as OpenAIHarmonyMessage
|
from openai_harmony import Message as OpenAIHarmonyMessage
|
||||||
|
from pydantic import TypeAdapter
|
||||||
|
|
||||||
from vllm import envs
|
from vllm import envs
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
@ -94,7 +95,10 @@ from vllm.entrypoints.openai.protocol import (
|
|||||||
ResponseUsage,
|
ResponseUsage,
|
||||||
StreamingResponsesResponse,
|
StreamingResponsesResponse,
|
||||||
)
|
)
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
from vllm.entrypoints.openai.serving_engine import (
|
||||||
|
GenerationError,
|
||||||
|
OpenAIServing,
|
||||||
|
)
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.entrypoints.responses_utils import (
|
from vllm.entrypoints.responses_utils import (
|
||||||
construct_input_messages,
|
construct_input_messages,
|
||||||
@ -541,6 +545,8 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
tokenizer,
|
tokenizer,
|
||||||
request_metadata,
|
request_metadata,
|
||||||
)
|
)
|
||||||
|
except GenerationError as e:
|
||||||
|
return self._convert_generation_error_to_response(e)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return self.create_error_response(str(e))
|
return self.create_error_response(str(e))
|
||||||
|
|
||||||
@ -648,6 +654,8 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
status = "incomplete"
|
status = "incomplete"
|
||||||
elif context.finish_reason == "abort":
|
elif context.finish_reason == "abort":
|
||||||
status = "cancelled"
|
status = "cancelled"
|
||||||
|
else:
|
||||||
|
self._raise_if_error(context.finish_reason, request.request_id)
|
||||||
else:
|
else:
|
||||||
status = "incomplete"
|
status = "incomplete"
|
||||||
elif isinstance(context, ParsableContext):
|
elif isinstance(context, ParsableContext):
|
||||||
@ -673,6 +681,9 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
assert len(final_res.outputs) == 1
|
assert len(final_res.outputs) == 1
|
||||||
final_output = final_res.outputs[0]
|
final_output = final_res.outputs[0]
|
||||||
|
|
||||||
|
# finish_reason='error' indicates retryable internal error
|
||||||
|
self._raise_if_error(final_output.finish_reason, request.request_id)
|
||||||
|
|
||||||
output = self._make_response_output_items(request, final_output, tokenizer)
|
output = self._make_response_output_items(request, final_output, tokenizer)
|
||||||
|
|
||||||
if request.enable_response_messages:
|
if request.enable_response_messages:
|
||||||
@ -1066,6 +1077,8 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
async for event in generator:
|
async for event in generator:
|
||||||
event_deque.append(event)
|
event_deque.append(event)
|
||||||
new_event_signal.set() # Signal new event available
|
new_event_signal.set() # Signal new event available
|
||||||
|
except GenerationError as e:
|
||||||
|
response = self._convert_generation_error_to_response(e)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception("Background request failed for %s", request.request_id)
|
logger.exception("Background request failed for %s", request.request_id)
|
||||||
response = self.create_error_response(str(e))
|
response = self.create_error_response(str(e))
|
||||||
@ -1089,6 +1102,8 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
response = await self.responses_full_generator(request, *args, **kwargs)
|
response = await self.responses_full_generator(request, *args, **kwargs)
|
||||||
|
except GenerationError as e:
|
||||||
|
response = self._convert_generation_error_to_response(e)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception("Background request failed for %s", request.request_id)
|
logger.exception("Background request failed for %s", request.request_id)
|
||||||
response = self.create_error_response(str(e))
|
response = self.create_error_response(str(e))
|
||||||
@ -1227,6 +1242,8 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
continue
|
continue
|
||||||
if ctx.last_output.outputs:
|
if ctx.last_output.outputs:
|
||||||
output = ctx.last_output.outputs[0]
|
output = ctx.last_output.outputs[0]
|
||||||
|
# finish_reason='error' indicates a retryable error
|
||||||
|
self._raise_if_error(output.finish_reason, request.request_id)
|
||||||
if reasoning_parser:
|
if reasoning_parser:
|
||||||
delta_message = reasoning_parser.extract_reasoning_streaming(
|
delta_message = reasoning_parser.extract_reasoning_streaming(
|
||||||
previous_text=previous_text,
|
previous_text=previous_text,
|
||||||
@ -1522,6 +1539,9 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
async for ctx in result_generator:
|
async for ctx in result_generator:
|
||||||
assert isinstance(ctx, StreamingHarmonyContext)
|
assert isinstance(ctx, StreamingHarmonyContext)
|
||||||
|
|
||||||
|
# finish_reason='error' indicates a retryable error
|
||||||
|
self._raise_if_error(ctx.finish_reason, request.request_id)
|
||||||
|
|
||||||
if ctx.is_expecting_start():
|
if ctx.is_expecting_start():
|
||||||
current_output_index += 1
|
current_output_index += 1
|
||||||
sent_output_item_added = False
|
sent_output_item_added = False
|
||||||
@ -2016,18 +2036,25 @@ class OpenAIServingResponses(OpenAIServing):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async for event_data in processer(
|
try:
|
||||||
request,
|
async for event_data in processer(
|
||||||
sampling_params,
|
request,
|
||||||
result_generator,
|
sampling_params,
|
||||||
context,
|
result_generator,
|
||||||
model_name,
|
context,
|
||||||
tokenizer,
|
model_name,
|
||||||
request_metadata,
|
tokenizer,
|
||||||
created_time,
|
request_metadata,
|
||||||
_increment_sequence_number_and_return,
|
created_time,
|
||||||
):
|
_increment_sequence_number_and_return,
|
||||||
yield event_data
|
):
|
||||||
|
yield event_data
|
||||||
|
except GenerationError as e:
|
||||||
|
error_json = self._convert_generation_error_to_streaming_response(e)
|
||||||
|
yield _increment_sequence_number_and_return(
|
||||||
|
TypeAdapter(StreamingResponsesResponse).validate_json(error_json)
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
async def empty_async_generator():
|
async def empty_async_generator():
|
||||||
# A hack to trick Python to think this is a generator but
|
# A hack to trick Python to think this is a generator but
|
||||||
|
|||||||
20
vllm/envs.py
20
vllm/envs.py
@ -1580,6 +1580,12 @@ def __getattr__(name: str):
|
|||||||
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_envs_cache_enabled() -> bool:
|
||||||
|
"""Checked if __getattr__ is wrapped with functools.cache"""
|
||||||
|
global __getattr__
|
||||||
|
return hasattr(__getattr__, "cache_clear")
|
||||||
|
|
||||||
|
|
||||||
def enable_envs_cache() -> None:
|
def enable_envs_cache() -> None:
|
||||||
"""
|
"""
|
||||||
Enables caching of environment variables. This is useful for performance
|
Enables caching of environment variables. This is useful for performance
|
||||||
@ -1590,6 +1596,9 @@ def enable_envs_cache() -> None:
|
|||||||
runtime overhead. This also means that environment variables should NOT
|
runtime overhead. This also means that environment variables should NOT
|
||||||
be updated after the service is initialized.
|
be updated after the service is initialized.
|
||||||
"""
|
"""
|
||||||
|
if _is_envs_cache_enabled():
|
||||||
|
# Avoid wrapping functools.cache multiple times
|
||||||
|
return
|
||||||
# Tag __getattr__ with functools.cache
|
# Tag __getattr__ with functools.cache
|
||||||
global __getattr__
|
global __getattr__
|
||||||
__getattr__ = functools.cache(__getattr__)
|
__getattr__ = functools.cache(__getattr__)
|
||||||
@ -1599,6 +1608,17 @@ def enable_envs_cache() -> None:
|
|||||||
__getattr__(key)
|
__getattr__(key)
|
||||||
|
|
||||||
|
|
||||||
|
def disable_envs_cache() -> None:
|
||||||
|
"""
|
||||||
|
Resets the environment variables cache. It could be used to isolate environments
|
||||||
|
between unit tests.
|
||||||
|
"""
|
||||||
|
global __getattr__
|
||||||
|
# If __getattr__ is wrapped by functions.cache, unwrap the caching layer.
|
||||||
|
if _is_envs_cache_enabled():
|
||||||
|
__getattr__ = __getattr__.__wrapped__
|
||||||
|
|
||||||
|
|
||||||
def __dir__():
|
def __dir__():
|
||||||
return list(environment_variables.keys())
|
return list(environment_variables.keys())
|
||||||
|
|
||||||
|
|||||||
@ -1556,6 +1556,14 @@ class FusedMoE(CustomOp):
|
|||||||
f"EPLB is not supported for {self.quant_method.method_name}."
|
f"EPLB is not supported for {self.quant_method.method_name}."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def valid_grouping() -> bool:
|
||||||
|
# Check if num_experts is greater than num_expert_group
|
||||||
|
# and is divisible by num_expert_group
|
||||||
|
num_experts = router_logits.shape[-1]
|
||||||
|
if num_experts <= self.num_expert_group:
|
||||||
|
return False
|
||||||
|
return num_experts % self.num_expert_group == 0
|
||||||
|
|
||||||
indices_type = self.quant_method.topk_indices_dtype
|
indices_type = self.quant_method.topk_indices_dtype
|
||||||
|
|
||||||
# Check if we should use a routing simulation strategy
|
# Check if we should use a routing simulation strategy
|
||||||
@ -1570,7 +1578,7 @@ class FusedMoE(CustomOp):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# DeepSeekv2 uses grouped_top_k
|
# DeepSeekv2 uses grouped_top_k
|
||||||
elif self.use_grouped_topk:
|
elif self.use_grouped_topk and valid_grouping():
|
||||||
assert self.topk_group is not None
|
assert self.topk_group is not None
|
||||||
assert self.num_expert_group is not None
|
assert self.num_expert_group is not None
|
||||||
if rocm_aiter_ops.is_fused_moe_enabled():
|
if rocm_aiter_ops.is_fused_moe_enabled():
|
||||||
|
|||||||
@ -502,6 +502,7 @@ class HunYuanVisionTransformer(nn.Module):
|
|||||||
cu_seqlens: list = [0]
|
cu_seqlens: list = [0]
|
||||||
|
|
||||||
hidden_states = x.to(device=self.device, dtype=self.dtype)
|
hidden_states = x.to(device=self.device, dtype=self.dtype)
|
||||||
|
# embeddings = patch_embeds + patch_pos_embed
|
||||||
hidden_states = self.embeddings(hidden_states, grid_thw)
|
hidden_states = self.embeddings(hidden_states, grid_thw)
|
||||||
|
|
||||||
for t, h, w in grid_thw:
|
for t, h, w in grid_thw:
|
||||||
@ -515,8 +516,14 @@ class HunYuanVisionTransformer(nn.Module):
|
|||||||
|
|
||||||
hidden_states = hidden_states.reshape(seq_len, -1)
|
hidden_states = hidden_states.reshape(seq_len, -1)
|
||||||
hidden_states = hidden_states.unsqueeze(0)
|
hidden_states = hidden_states.unsqueeze(0)
|
||||||
for layer_num, layer in enumerate(self.layers):
|
|
||||||
hidden_states = layer(hidden_states)
|
# build per-image lengths once
|
||||||
|
split_lengths = [int(h) * int(w) for (_, h, w) in grid_thw]
|
||||||
|
for layer in self.layers:
|
||||||
|
# hidden_states: (1, T_total, D)
|
||||||
|
parts = hidden_states.split(split_lengths, dim=1) # list of (1, L_i, D)
|
||||||
|
parts = [layer(p) for p in parts]
|
||||||
|
hidden_states = torch.cat(parts, dim=1)
|
||||||
|
|
||||||
# adapter
|
# adapter
|
||||||
split_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
split_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
|
||||||
|
|||||||
@ -201,7 +201,7 @@ class MiniMaxM2Attention(nn.Module):
|
|||||||
|
|
||||||
self.rotary_emb = get_rope(
|
self.rotary_emb = get_rope(
|
||||||
self.head_dim,
|
self.head_dim,
|
||||||
rotary_dim=rotary_dim,
|
rotary_dim=self.head_dim,
|
||||||
max_position=max_position_embeddings,
|
max_position=max_position_embeddings,
|
||||||
rope_parameters=rope_parameters,
|
rope_parameters=rope_parameters,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -89,7 +89,7 @@ def _extract_data_from_linear_base_module(
|
|||||||
assert m.quant_method.quant_config is not None
|
assert m.quant_method.quant_config is not None
|
||||||
|
|
||||||
w = m.weight
|
w = m.weight
|
||||||
ws = m.weight_scale
|
ws = m.weight_scale_inv if hasattr(m, "weight_scale_inv") else m.weight_scale
|
||||||
quant_block_size = m.quant_method.quant_config.weight_block_size
|
quant_block_size = m.quant_method.quant_config.weight_block_size
|
||||||
|
|
||||||
assert isinstance(w, torch.Tensor)
|
assert isinstance(w, torch.Tensor)
|
||||||
|
|||||||
@ -954,7 +954,7 @@ MultiModalKwargsOptionalItems: TypeAlias = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.")
|
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.14.")
|
||||||
class MultiModalKwargs(UserDict[str, NestedTensors]):
|
class MultiModalKwargs(UserDict[str, NestedTensors]):
|
||||||
"""
|
"""
|
||||||
A dictionary that represents the keyword arguments to
|
A dictionary that represents the keyword arguments to
|
||||||
@ -964,7 +964,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
@deprecated(
|
@deprecated(
|
||||||
"`MultiModalKwargs.from_hf_inputs` is deprecated and "
|
"`MultiModalKwargs.from_hf_inputs` is deprecated and "
|
||||||
"will be removed in v0.13. "
|
"will be removed in v0.14. "
|
||||||
"Please use `MultiModalKwargsItems.from_hf_inputs` and "
|
"Please use `MultiModalKwargsItems.from_hf_inputs` and "
|
||||||
"access the tensor data using `.get_data()`."
|
"access the tensor data using `.get_data()`."
|
||||||
)
|
)
|
||||||
@ -977,7 +977,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
@deprecated(
|
@deprecated(
|
||||||
"`MultiModalKwargs.from_items` is deprecated and "
|
"`MultiModalKwargs.from_items` is deprecated and "
|
||||||
"will be removed in v0.13. "
|
"will be removed in v0.14. "
|
||||||
"Please use `MultiModalKwargsItems.from_seq` and "
|
"Please use `MultiModalKwargsItems.from_seq` and "
|
||||||
"access the tensor data using `.get_data()`."
|
"access the tensor data using `.get_data()`."
|
||||||
)
|
)
|
||||||
|
|||||||
@ -429,12 +429,12 @@ def group_mm_kwargs_by_modality(
|
|||||||
if merge_by_field_config is not None:
|
if merge_by_field_config is not None:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
|
"The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
|
||||||
"is deprecated and will be removed in v0.13."
|
"is deprecated and will be removed in v0.14."
|
||||||
)
|
)
|
||||||
if multimodal_cpu_fields is not None:
|
if multimodal_cpu_fields is not None:
|
||||||
logger.warning_once(
|
logger.warning_once(
|
||||||
"The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` "
|
"The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` "
|
||||||
"is deprecated and will be removed in v0.13."
|
"is deprecated and will be removed in v0.14."
|
||||||
)
|
)
|
||||||
|
|
||||||
from vllm.multimodal.inputs import MultiModalKwargsItems
|
from vllm.multimodal.inputs import MultiModalKwargsItems
|
||||||
|
|||||||
@ -403,7 +403,21 @@ class RocmPlatform(Platform):
|
|||||||
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||||
|
|
||||||
if cache_config and cache_config.block_size is None:
|
if cache_config and cache_config.block_size is None:
|
||||||
cache_config.block_size = 16
|
if (
|
||||||
|
envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER
|
||||||
|
# NOTE: This block has been deprecated
|
||||||
|
# or get_env_variable_attn_backend()
|
||||||
|
# == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN
|
||||||
|
# TODO: monitor https://github.com/vllm-project/vllm/pull/30396
|
||||||
|
# to see how we can transition to the new way of selecting
|
||||||
|
# attention backends
|
||||||
|
):
|
||||||
|
cache_config.block_size = 64
|
||||||
|
logger.warning(
|
||||||
|
"[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
cache_config.block_size = 16
|
||||||
|
|
||||||
if parallel_config.worker_cls == "auto":
|
if parallel_config.worker_cls == "auto":
|
||||||
parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
|
parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
|
||||||
|
|||||||
@ -17,7 +17,7 @@ def __getattr__(name: str):
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
|
"`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
|
||||||
"`vllm.tokenizers.TokenizerLike`. "
|
"`vllm.tokenizers.TokenizerLike`. "
|
||||||
"The old name will be removed in v0.13.",
|
"The old name will be removed in v0.14.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
@ -29,7 +29,7 @@ def __getattr__(name: str):
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`vllm.transformers_utils.tokenizer.get_tokenizer` "
|
"`vllm.transformers_utils.tokenizer.get_tokenizer` "
|
||||||
"has been moved to `vllm.tokenizers.get_tokenizer`. "
|
"has been moved to `vllm.tokenizers.get_tokenizer`. "
|
||||||
"The old name will be removed in v0.13.",
|
"The old name will be removed in v0.14.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
@ -41,7 +41,7 @@ def __getattr__(name: str):
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
|
"`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
|
||||||
"has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
|
"has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
|
||||||
"The old name will be removed in v0.13.",
|
"The old name will be removed in v0.14.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
@ -53,7 +53,7 @@ def __getattr__(name: str):
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
|
"`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
|
||||||
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
|
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
|
||||||
"The old name will be removed in v0.13.",
|
"The old name will be removed in v0.14.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
@ -65,7 +65,7 @@ def __getattr__(name: str):
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
|
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
|
||||||
"has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
|
"has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
|
||||||
"The old name will be removed in v0.13.",
|
"The old name will be removed in v0.14.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
@ -75,7 +75,7 @@ def __getattr__(name: str):
|
|||||||
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||||
|
|
||||||
|
|
||||||
@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.")
|
@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.")
|
||||||
def decode_tokens(
|
def decode_tokens(
|
||||||
tokenizer: TokenizerLike,
|
tokenizer: TokenizerLike,
|
||||||
token_ids: list[int],
|
token_ids: list[int],
|
||||||
@ -97,7 +97,7 @@ def decode_tokens(
|
|||||||
return tokenizer.decode(token_ids, **kw_args)
|
return tokenizer.decode(token_ids, **kw_args)
|
||||||
|
|
||||||
|
|
||||||
@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.")
|
@deprecated("Will be removed in v0.14. Please use `tokenizer.encode()` instead.")
|
||||||
def encode_tokens(
|
def encode_tokens(
|
||||||
tokenizer: TokenizerLike,
|
tokenizer: TokenizerLike,
|
||||||
text: str,
|
text: str,
|
||||||
|
|||||||
@ -11,7 +11,7 @@ def __getattr__(name: str):
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been "
|
"`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been "
|
||||||
"moved to `vllm.tokenizers.TokenizerLike`. "
|
"moved to `vllm.tokenizers.TokenizerLike`. "
|
||||||
"The old name will be removed in v0.13.",
|
"The old name will be removed in v0.14.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
@ -23,7 +23,7 @@ def __getattr__(name: str):
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been "
|
"`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been "
|
||||||
"moved to `vllm.tokenizers.TokenizerRegistry`. "
|
"moved to `vllm.tokenizers.TokenizerRegistry`. "
|
||||||
"The old name will be removed in v0.13.",
|
"The old name will be removed in v0.14.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:
|
|||||||
|
|
||||||
def force_use_trtllm_attention() -> bool | None:
|
def force_use_trtllm_attention() -> bool | None:
|
||||||
"""
|
"""
|
||||||
|
This function should only be called during initialization stage when vllm config
|
||||||
|
is set.
|
||||||
Return `None` if --attention-config.use_trtllm_attention is not set,
|
Return `None` if --attention-config.use_trtllm_attention is not set,
|
||||||
return `True` if TRTLLM attention is forced to be used,
|
return `True` if TRTLLM attention is forced to be used,
|
||||||
return `False` if TRTLLM attention is forced to be not used.
|
return `False` if TRTLLM attention is forced to be not used.
|
||||||
@ -296,11 +298,12 @@ def use_trtllm_attention(
|
|||||||
kv_cache_dtype: str,
|
kv_cache_dtype: str,
|
||||||
q_dtype: torch.dtype,
|
q_dtype: torch.dtype,
|
||||||
is_prefill: bool,
|
is_prefill: bool,
|
||||||
|
# None means auto-detection, True means force on, False means force off
|
||||||
|
force_use_trtllm: bool | None = None,
|
||||||
has_sinks: bool = False,
|
has_sinks: bool = False,
|
||||||
has_spec: bool = False,
|
has_spec: bool = False,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Return `True` if TRTLLM attention is used."""
|
"""Return `True` if TRTLLM attention is used."""
|
||||||
force_use_trtllm = force_use_trtllm_attention()
|
|
||||||
|
|
||||||
# CLI argument is set to 0 - respect it
|
# CLI argument is set to 0 - respect it
|
||||||
if force_use_trtllm is not None and not force_use_trtllm:
|
if force_use_trtllm is not None and not force_use_trtllm:
|
||||||
|
|||||||
@ -21,7 +21,7 @@ from vllm.v1.attention.backends.utils import (
|
|||||||
CommonAttentionMetadata,
|
CommonAttentionMetadata,
|
||||||
split_decodes_and_prefills,
|
split_decodes_and_prefills,
|
||||||
)
|
)
|
||||||
from vllm.v1.kv_cache_interface import AttentionSpec
|
from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -50,11 +50,13 @@ class CPUAttentionBackend(AttentionBackend):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supports_attn_type(cls, attn_type: str) -> bool:
|
def supports_attn_type(cls, attn_type: str) -> bool:
|
||||||
"""CPU attention supports decoder and encoder-only attention."""
|
"""CPU attention supports decoder,
|
||||||
|
encoder-only and encoder-decoder attention."""
|
||||||
return attn_type in (
|
return attn_type in (
|
||||||
AttentionType.DECODER,
|
AttentionType.DECODER,
|
||||||
AttentionType.ENCODER,
|
AttentionType.ENCODER,
|
||||||
AttentionType.ENCODER_ONLY,
|
AttentionType.ENCODER_ONLY,
|
||||||
|
AttentionType.ENCODER_DECODER,
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -136,6 +138,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
|
|||||||
self.window_size = -1
|
self.window_size = -1
|
||||||
self.block_size = vllm_config.cache_config.block_size
|
self.block_size = vllm_config.cache_config.block_size
|
||||||
self.isa = _get_attn_isa(self.dtype, self.block_size)
|
self.isa = _get_attn_isa(self.dtype, self.block_size)
|
||||||
|
self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec)
|
||||||
|
|
||||||
def build(
|
def build(
|
||||||
self,
|
self,
|
||||||
@ -151,7 +154,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
|
|||||||
seq_lens = common_attn_metadata.seq_lens
|
seq_lens = common_attn_metadata.seq_lens
|
||||||
block_table_tensor = common_attn_metadata.block_table_tensor
|
block_table_tensor = common_attn_metadata.block_table_tensor
|
||||||
slot_mapping = common_attn_metadata.slot_mapping
|
slot_mapping = common_attn_metadata.slot_mapping
|
||||||
causal = common_attn_metadata.causal
|
causal = False if self.is_cross_attention else common_attn_metadata.causal
|
||||||
|
|
||||||
sdpa_start_loc = query_start_loc
|
sdpa_start_loc = query_start_loc
|
||||||
num_decode_tokens = 0
|
num_decode_tokens = 0
|
||||||
@ -171,22 +174,19 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
|
|||||||
query_start_loc = query_start_loc[: num_decodes + 1]
|
query_start_loc = query_start_loc[: num_decodes + 1]
|
||||||
block_table_tensor = block_table_tensor[:num_decodes]
|
block_table_tensor = block_table_tensor[:num_decodes]
|
||||||
|
|
||||||
sheduler_metadata = None
|
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
|
||||||
if causal:
|
num_reqs=num_reqs,
|
||||||
# for decode batch, use the custom kernel
|
num_heads=self.num_heads,
|
||||||
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
|
num_kv_heads=self.num_kv_heads,
|
||||||
num_reqs=num_reqs,
|
head_dim=self.head_dim,
|
||||||
num_heads=self.num_heads,
|
seq_lens=seq_lens,
|
||||||
num_kv_heads=self.num_kv_heads,
|
dtype=self.dtype,
|
||||||
head_dim=self.head_dim,
|
query_start_loc=query_start_loc,
|
||||||
seq_lens=seq_lens,
|
causal=causal,
|
||||||
dtype=self.dtype,
|
sliding_window_size=self.window_size,
|
||||||
query_start_loc=query_start_loc,
|
isa=self.isa,
|
||||||
causal=causal,
|
enable_kv_split=True,
|
||||||
sliding_window_size=self.window_size,
|
)
|
||||||
isa=self.isa,
|
|
||||||
enable_kv_split=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
attn_metadata = CPUAttentionMetadata(
|
attn_metadata = CPUAttentionMetadata(
|
||||||
isa=self.isa,
|
isa=self.isa,
|
||||||
|
|||||||
@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
|||||||
super().__init__(kv_cache_spec, layer_names, vllm_config, device)
|
super().__init__(kv_cache_spec, layer_names, vllm_config, device)
|
||||||
self.cache_config = vllm_config.cache_config
|
self.cache_config = vllm_config.cache_config
|
||||||
self.model_config = vllm_config.model_config
|
self.model_config = vllm_config.model_config
|
||||||
|
self.attention_config = vllm_config.attention_config
|
||||||
self._workspace_buffer = None
|
self._workspace_buffer = None
|
||||||
self._prefill_wrapper: (
|
self._prefill_wrapper: (
|
||||||
BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
|
BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
|
||||||
@ -779,6 +780,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
|||||||
self.cache_dtype,
|
self.cache_dtype,
|
||||||
self.q_data_type,
|
self.q_data_type,
|
||||||
is_prefill=True,
|
is_prefill=True,
|
||||||
|
force_use_trtllm=self.attention_config.use_trtllm_attention,
|
||||||
has_sinks=self.has_sinks,
|
has_sinks=self.has_sinks,
|
||||||
has_spec=uses_spec_reorder,
|
has_spec=uses_spec_reorder,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -397,6 +397,25 @@ class BlockPool:
|
|||||||
[block for block in blocks_list if block.ref_cnt == 0 and not block.is_null]
|
[block for block in blocks_list if block.ref_cnt == 0 and not block.is_null]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def evict_blocks(self, block_ids: set[int]) -> None:
|
||||||
|
"""evict blocks from the prefix cache by their block IDs.
|
||||||
|
|
||||||
|
only evicts blocks that are currently cached (have a hash). blocks
|
||||||
|
with ref_cnt > 0 are not freed from the block pool, only evicted
|
||||||
|
from the prefix cache hash table.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
block_ids: Set of block IDs to evict from cache.
|
||||||
|
"""
|
||||||
|
for block_id in block_ids:
|
||||||
|
assert block_id < len(self.blocks), (
|
||||||
|
f"Invalid block_id {block_id} >= {len(self.blocks)}. "
|
||||||
|
f"This indicates a bug in the KV connector - workers should "
|
||||||
|
f"only report block IDs that were allocated by the scheduler."
|
||||||
|
)
|
||||||
|
block = self.blocks[block_id]
|
||||||
|
self._maybe_evict_cached_block(block)
|
||||||
|
|
||||||
def reset_prefix_cache(self) -> bool:
|
def reset_prefix_cache(self) -> bool:
|
||||||
"""Reset prefix cache. This function may be used in RLHF
|
"""Reset prefix cache. This function may be used in RLHF
|
||||||
flows to invalid prefix caching after the weights are updated,
|
flows to invalid prefix caching after the weights are updated,
|
||||||
|
|||||||
@ -333,6 +333,14 @@ class KVCacheManager:
|
|||||||
"""
|
"""
|
||||||
self.coordinator.free(request.request_id)
|
self.coordinator.free(request.request_id)
|
||||||
|
|
||||||
|
def evict_blocks(self, block_ids: set[int]) -> None:
|
||||||
|
"""evict blocks from the prefix cache by their block IDs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
block_ids: Set of block IDs to evict from cache.
|
||||||
|
"""
|
||||||
|
self.block_pool.evict_blocks(block_ids)
|
||||||
|
|
||||||
def reset_prefix_cache(self) -> bool:
|
def reset_prefix_cache(self) -> bool:
|
||||||
"""Reset prefix cache. This function may be used in RLHF
|
"""Reset prefix cache. This function may be used in RLHF
|
||||||
flows to invalidate prefix caching after the weights are updated,
|
flows to invalidate prefix caching after the weights are updated,
|
||||||
|
|||||||
@ -687,7 +687,9 @@ def check_enough_kv_cache_memory(
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"No available memory for the cache blocks. "
|
"No available memory for the cache blocks. "
|
||||||
"Try increasing `gpu_memory_utilization` when "
|
"Try increasing `gpu_memory_utilization` when "
|
||||||
"initializing the engine."
|
"initializing the engine. "
|
||||||
|
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
|
||||||
|
"for more details."
|
||||||
)
|
)
|
||||||
|
|
||||||
max_model_len = vllm_config.model_config.max_model_len
|
max_model_len = vllm_config.model_config.max_model_len
|
||||||
@ -711,8 +713,10 @@ def check_enough_kv_cache_memory(
|
|||||||
f"cache is needed, which is larger than the available KV cache "
|
f"cache is needed, which is larger than the available KV cache "
|
||||||
f"memory ({available_memory / GiB_bytes:.2f} GiB). "
|
f"memory ({available_memory / GiB_bytes:.2f} GiB). "
|
||||||
f"{estimated_msg} "
|
f"{estimated_msg} "
|
||||||
f"Try increasing `gpu_memory_utilization` or decreasing "
|
f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
|
||||||
f"`max_model_len` when initializing the engine."
|
f"when initializing the engine. "
|
||||||
|
f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
|
||||||
|
f"for more details."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -106,6 +106,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
# KV Connector pushes/pull of remote KVs for P/D and offloading.
|
# KV Connector pushes/pull of remote KVs for P/D and offloading.
|
||||||
self.connector = None
|
self.connector = None
|
||||||
self.connector_prefix_cache_stats: PrefixCacheStats | None = None
|
self.connector_prefix_cache_stats: PrefixCacheStats | None = None
|
||||||
|
self.recompute_kv_load_failures = True
|
||||||
if self.vllm_config.kv_transfer_config is not None:
|
if self.vllm_config.kv_transfer_config is not None:
|
||||||
assert not self.is_encoder_decoder, (
|
assert not self.is_encoder_decoder, (
|
||||||
"Encoder-decoder models are not currently supported with KV connectors"
|
"Encoder-decoder models are not currently supported with KV connectors"
|
||||||
@ -117,6 +118,10 @@ class Scheduler(SchedulerInterface):
|
|||||||
)
|
)
|
||||||
if self.log_stats:
|
if self.log_stats:
|
||||||
self.connector_prefix_cache_stats = PrefixCacheStats()
|
self.connector_prefix_cache_stats = PrefixCacheStats()
|
||||||
|
kv_load_failure_policy = (
|
||||||
|
self.vllm_config.kv_transfer_config.kv_load_failure_policy
|
||||||
|
)
|
||||||
|
self.recompute_kv_load_failures = kv_load_failure_policy == "recompute"
|
||||||
|
|
||||||
self.kv_event_publisher = EventPublisherFactory.create(
|
self.kv_event_publisher = EventPublisherFactory.create(
|
||||||
self.kv_events_config,
|
self.kv_events_config,
|
||||||
@ -1066,7 +1071,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
for req_id, num_tokens_scheduled in num_scheduled_tokens.items():
|
for req_id, num_tokens_scheduled in num_scheduled_tokens.items():
|
||||||
assert num_tokens_scheduled > 0
|
assert num_tokens_scheduled > 0
|
||||||
if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids:
|
if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids:
|
||||||
# Skip requests that were recovered from KV load failure
|
# skip failed or rescheduled requests from KV load failure
|
||||||
continue
|
continue
|
||||||
request = self.requests.get(req_id)
|
request = self.requests.get(req_id)
|
||||||
if request is None:
|
if request is None:
|
||||||
@ -1177,6 +1182,21 @@ class Scheduler(SchedulerInterface):
|
|||||||
# This is a rare case and unlikely to impact performance.
|
# This is a rare case and unlikely to impact performance.
|
||||||
self.waiting.remove_requests(stopped_preempted_reqs)
|
self.waiting.remove_requests(stopped_preempted_reqs)
|
||||||
|
|
||||||
|
if failed_kv_load_req_ids and not self.recompute_kv_load_failures:
|
||||||
|
requests = [self.requests[req_id] for req_id in failed_kv_load_req_ids]
|
||||||
|
self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR)
|
||||||
|
for request in requests:
|
||||||
|
outputs[request.client_index].append(
|
||||||
|
EngineCoreOutput(
|
||||||
|
request_id=request.request_id,
|
||||||
|
new_token_ids=[],
|
||||||
|
finish_reason=request.get_finished_reason(),
|
||||||
|
events=request.take_events(),
|
||||||
|
trace_headers=request.trace_headers,
|
||||||
|
num_cached_tokens=request.num_cached_tokens,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# KV Connector: update state for finished KV Transfers.
|
# KV Connector: update state for finished KV Transfers.
|
||||||
if kv_connector_output:
|
if kv_connector_output:
|
||||||
self._update_from_kv_xfer_finished(kv_connector_output)
|
self._update_from_kv_xfer_finished(kv_connector_output)
|
||||||
@ -1610,8 +1630,11 @@ class Scheduler(SchedulerInterface):
|
|||||||
self._free_blocks(self.requests[req_id])
|
self._free_blocks(self.requests[req_id])
|
||||||
|
|
||||||
def _update_requests_with_invalid_blocks(
|
def _update_requests_with_invalid_blocks(
|
||||||
self, requests: Iterable[Request], invalid_block_ids: set[int]
|
self,
|
||||||
) -> tuple[set[str], int]:
|
requests: Iterable[Request],
|
||||||
|
invalid_block_ids: set[int],
|
||||||
|
evict_blocks: bool = True,
|
||||||
|
) -> tuple[set[str], int, set[int]]:
|
||||||
"""
|
"""
|
||||||
Identify and update requests affected by invalid KV cache blocks.
|
Identify and update requests affected by invalid KV cache blocks.
|
||||||
|
|
||||||
@ -1623,16 +1646,21 @@ class Scheduler(SchedulerInterface):
|
|||||||
Args:
|
Args:
|
||||||
requests: The set of requests to scan for invalid blocks.
|
requests: The set of requests to scan for invalid blocks.
|
||||||
invalid_block_ids: IDs of invalid blocks.
|
invalid_block_ids: IDs of invalid blocks.
|
||||||
|
evict_blocks: Whether to collect blocks for eviction (False for
|
||||||
|
async requests which aren't cached yet).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tuple:
|
tuple:
|
||||||
- affected_req_ids (set[str]): IDs of requests impacted by
|
- affected_req_ids (set[str]): IDs of requests impacted by
|
||||||
invalid blocks.
|
invalid blocks.
|
||||||
- total_affected_tokens (int): Total number of tokens that must
|
- total_affected_tokens (int): Total number of tokens that must
|
||||||
be recomputed across all affected requests (for observability).
|
be recomputed across all affected requests.
|
||||||
|
- blocks_to_evict (set[int]): Block IDs to evict from cache,
|
||||||
|
including invalid blocks and downstream dependent blocks.
|
||||||
"""
|
"""
|
||||||
affected_req_ids: set[str] = set()
|
affected_req_ids: set[str] = set()
|
||||||
total_affected_tokens = 0
|
total_affected_tokens = 0
|
||||||
|
blocks_to_evict: set[int] = set()
|
||||||
# If a block is invalid and shared by multiple requests in the batch,
|
# If a block is invalid and shared by multiple requests in the batch,
|
||||||
# these requests must be rescheduled, but only the first will recompute
|
# these requests must be rescheduled, but only the first will recompute
|
||||||
# it. This set tracks blocks already marked for recomputation.
|
# it. This set tracks blocks already marked for recomputation.
|
||||||
@ -1690,6 +1718,9 @@ class Scheduler(SchedulerInterface):
|
|||||||
)
|
)
|
||||||
total_affected_tokens += num_affected_tokens
|
total_affected_tokens += num_affected_tokens
|
||||||
request.num_external_computed_tokens -= num_affected_tokens
|
request.num_external_computed_tokens -= num_affected_tokens
|
||||||
|
# collect invalid block and all downstream dependent blocks
|
||||||
|
if evict_blocks:
|
||||||
|
blocks_to_evict.update(req_block_ids[idx:])
|
||||||
|
|
||||||
if is_affected:
|
if is_affected:
|
||||||
if not marked_invalid_block:
|
if not marked_invalid_block:
|
||||||
@ -1705,47 +1736,70 @@ class Scheduler(SchedulerInterface):
|
|||||||
|
|
||||||
affected_req_ids.add(request.request_id)
|
affected_req_ids.add(request.request_id)
|
||||||
|
|
||||||
return affected_req_ids, total_affected_tokens
|
return affected_req_ids, total_affected_tokens, blocks_to_evict
|
||||||
|
|
||||||
def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]:
|
def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]:
|
||||||
total_requests_to_reschedule = 0
|
"""
|
||||||
total_tokens_to_reschedule = 0
|
Handle requests affected by invalid KV cache blocks.
|
||||||
|
|
||||||
# --- Handle async KV loads (WAITING_FOR_REMOTE_KVS) ---
|
Returns:
|
||||||
|
Set of affected request IDs to skip in update_from_output main loop.
|
||||||
|
"""
|
||||||
|
should_fail = not self.recompute_kv_load_failures
|
||||||
|
|
||||||
|
# handle async KV loads (not cached yet, evict_blocks=False)
|
||||||
async_load_reqs = (
|
async_load_reqs = (
|
||||||
req
|
req
|
||||||
for req in self.waiting
|
for req in self.waiting
|
||||||
if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
|
if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
|
||||||
)
|
)
|
||||||
async_affected_req_ids, num_tokens_to_reschedule = (
|
async_failed_req_ids, num_failed_tokens, _ = (
|
||||||
self._update_requests_with_invalid_blocks(
|
self._update_requests_with_invalid_blocks(
|
||||||
async_load_reqs, invalid_block_ids
|
async_load_reqs, invalid_block_ids, evict_blocks=False
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
total_requests_to_reschedule += len(async_affected_req_ids)
|
total_failed_requests = len(async_failed_req_ids)
|
||||||
total_tokens_to_reschedule += num_tokens_to_reschedule
|
total_failed_tokens = num_failed_tokens
|
||||||
|
|
||||||
# Mark requests with async KV load failures; they will be rescheduled
|
# handle sync loads (may be cached, collect blocks for eviction)
|
||||||
# once loading completes.
|
sync_failed_req_ids, num_failed_tokens, sync_blocks_to_evict = (
|
||||||
self.failed_recving_kv_req_ids |= async_affected_req_ids
|
self._update_requests_with_invalid_blocks(
|
||||||
|
self.running, invalid_block_ids, evict_blocks=True
|
||||||
# --- Handle sync KV loads (running requests) ---
|
)
|
||||||
sync_affected_req_ids, num_tokens_to_reschedule = (
|
|
||||||
self._update_requests_with_invalid_blocks(self.running, invalid_block_ids)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
total_requests_to_reschedule += len(sync_affected_req_ids)
|
total_failed_requests += len(sync_failed_req_ids)
|
||||||
total_tokens_to_reschedule += num_tokens_to_reschedule
|
total_failed_tokens += num_failed_tokens
|
||||||
|
|
||||||
if total_requests_to_reschedule:
|
if not total_failed_requests:
|
||||||
logger.warning(
|
return set()
|
||||||
"Recovered from KV load failure: "
|
|
||||||
"%d request(s) rescheduled (%d tokens affected).",
|
# evict invalid blocks and downstream dependent blocks from cache
|
||||||
total_requests_to_reschedule,
|
# only when not using recompute policy (where blocks will be recomputed
|
||||||
total_tokens_to_reschedule,
|
# and reused by other requests sharing them)
|
||||||
|
if sync_blocks_to_evict and not self.recompute_kv_load_failures:
|
||||||
|
self.kv_cache_manager.evict_blocks(sync_blocks_to_evict)
|
||||||
|
|
||||||
|
if should_fail:
|
||||||
|
all_failed_req_ids = async_failed_req_ids | sync_failed_req_ids
|
||||||
|
logger.error(
|
||||||
|
"Failing %d request(s) due to KV load failure "
|
||||||
|
"(failure_policy=fail, %d tokens affected). Request IDs: %s",
|
||||||
|
total_failed_requests,
|
||||||
|
total_failed_tokens,
|
||||||
|
all_failed_req_ids,
|
||||||
)
|
)
|
||||||
|
return all_failed_req_ids
|
||||||
|
|
||||||
# Return the IDs of affected running requests to skip in
|
logger.warning(
|
||||||
# update_from_output.
|
"Recovered from KV load failure: "
|
||||||
return sync_affected_req_ids
|
"%d request(s) rescheduled (%d tokens affected).",
|
||||||
|
total_failed_requests,
|
||||||
|
total_failed_tokens,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mark async requests with KV load failures for retry once loading completes
|
||||||
|
self.failed_recving_kv_req_ids |= async_failed_req_ids
|
||||||
|
# Return sync affected IDs to skip in update_from_output
|
||||||
|
return sync_failed_req_ids
|
||||||
|
|||||||
@ -19,24 +19,27 @@ from vllm.v1.serial_utils import UtilityResult
|
|||||||
|
|
||||||
# These are possible values of RequestOutput.finish_reason,
|
# These are possible values of RequestOutput.finish_reason,
|
||||||
# so form part of the external API.
|
# so form part of the external API.
|
||||||
FINISH_REASON_STRINGS = ("stop", "length", "abort")
|
FINISH_REASON_STRINGS = ("stop", "length", "abort", "error")
|
||||||
|
|
||||||
|
|
||||||
class FinishReason(enum.IntEnum):
|
class FinishReason(enum.IntEnum):
|
||||||
"""
|
"""
|
||||||
Reason a request finished - stop, length, or abort.
|
Reason a request finished - stop, length, abort, or error.
|
||||||
|
|
||||||
Int rather than Str for more compact serialization.
|
Int rather than Str for more compact serialization.
|
||||||
|
|
||||||
stop - a stop string was emitted
|
stop - a stop string was emitted
|
||||||
length - max_tokens was consumed, or max_model_len was reached
|
length - max_tokens was consumed, or max_model_len was reached
|
||||||
abort - aborted for another reason
|
abort - aborted by client
|
||||||
|
error - retryable request-level internal error (e.g., KV load failure).
|
||||||
|
Invariant: always converted to 500 Internal Server Error.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
STOP = 0
|
STOP = 0
|
||||||
LENGTH = 1
|
LENGTH = 1
|
||||||
ABORT = 2
|
ABORT = 2
|
||||||
|
ERROR = 3
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return FINISH_REASON_STRINGS[self.value]
|
return FINISH_REASON_STRINGS[self.value]
|
||||||
|
|||||||
@ -192,7 +192,7 @@ class AsyncLLM(EngineClient):
|
|||||||
@property
|
@property
|
||||||
@deprecated(
|
@deprecated(
|
||||||
"`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. "
|
"`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. "
|
||||||
"The old name will be removed in v0.13."
|
"The old name will be removed in v0.14."
|
||||||
)
|
)
|
||||||
def processor(self):
|
def processor(self):
|
||||||
return self.input_processor
|
return self.input_processor
|
||||||
@ -701,10 +701,6 @@ class AsyncLLM(EngineClient):
|
|||||||
def tokenizer(self) -> TokenizerLike | None:
|
def tokenizer(self) -> TokenizerLike | None:
|
||||||
return self.input_processor.tokenizer
|
return self.input_processor.tokenizer
|
||||||
|
|
||||||
@tokenizer.setter
|
|
||||||
def tokenizer(self, tokenizer: TokenizerLike | None) -> None:
|
|
||||||
self.input_processor.tokenizer = tokenizer
|
|
||||||
|
|
||||||
async def get_tokenizer(self) -> TokenizerLike:
|
async def get_tokenizer(self) -> TokenizerLike:
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|||||||
@ -211,6 +211,9 @@ class EngineCore:
|
|||||||
freeze_gc_heap()
|
freeze_gc_heap()
|
||||||
# If enable, attach GC debugger after static variable freeze.
|
# If enable, attach GC debugger after static variable freeze.
|
||||||
maybe_attach_gc_debug_callback()
|
maybe_attach_gc_debug_callback()
|
||||||
|
# Enable environment variable cache (e.g. assume no more
|
||||||
|
# environment variable overrides after this point)
|
||||||
|
enable_envs_cache()
|
||||||
|
|
||||||
def _initialize_kv_caches(
|
def _initialize_kv_caches(
|
||||||
self, vllm_config: VllmConfig
|
self, vllm_config: VllmConfig
|
||||||
@ -672,10 +675,6 @@ class EngineCoreProc(EngineCore):
|
|||||||
assert addresses.coordinator_input is not None
|
assert addresses.coordinator_input is not None
|
||||||
logger.info("Waiting for READY message from DP Coordinator...")
|
logger.info("Waiting for READY message from DP Coordinator...")
|
||||||
|
|
||||||
# Enable environment variable cache (e.g. assume no more
|
|
||||||
# environment variable overrides after this point)
|
|
||||||
enable_envs_cache()
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def _perform_handshakes(
|
def _perform_handshakes(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -64,10 +64,6 @@ class InputProcessor:
|
|||||||
def tokenizer(self) -> TokenizerLike | None:
|
def tokenizer(self) -> TokenizerLike | None:
|
||||||
return self.input_preprocessor.tokenizer
|
return self.input_preprocessor.tokenizer
|
||||||
|
|
||||||
@tokenizer.setter
|
|
||||||
def tokenizer(self, tokenizer: TokenizerLike | None) -> None:
|
|
||||||
self.input_preprocessor.tokenizer = tokenizer
|
|
||||||
|
|
||||||
def _validate_logprobs(
|
def _validate_logprobs(
|
||||||
self,
|
self,
|
||||||
params: SamplingParams,
|
params: SamplingParams,
|
||||||
|
|||||||
@ -139,7 +139,7 @@ class LLMEngine:
|
|||||||
@property
|
@property
|
||||||
@deprecated(
|
@deprecated(
|
||||||
"`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
|
"`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
|
||||||
"The old name will be removed in v0.13."
|
"The old name will be removed in v0.14."
|
||||||
)
|
)
|
||||||
def processor(self):
|
def processor(self):
|
||||||
return self.input_processor
|
return self.input_processor
|
||||||
@ -358,10 +358,6 @@ class LLMEngine:
|
|||||||
def tokenizer(self) -> TokenizerLike | None:
|
def tokenizer(self) -> TokenizerLike | None:
|
||||||
return self.input_processor.tokenizer
|
return self.input_processor.tokenizer
|
||||||
|
|
||||||
@tokenizer.setter
|
|
||||||
def tokenizer(self, tokenizer: TokenizerLike | None) -> None:
|
|
||||||
self.input_processor.tokenizer = tokenizer
|
|
||||||
|
|
||||||
def get_tokenizer(self) -> TokenizerLike:
|
def get_tokenizer(self) -> TokenizerLike:
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|||||||
@ -10,7 +10,7 @@ def __getattr__(name: str):
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"`vllm.v1.engine.processor.Processor` has been moved to "
|
"`vllm.v1.engine.processor.Processor` has been moved to "
|
||||||
"`vllm.v1.engine.input_processor.InputProcessor`. "
|
"`vllm.v1.engine.input_processor.InputProcessor`. "
|
||||||
"The old name will be removed in v0.13.",
|
"The old name will be removed in v0.14.",
|
||||||
DeprecationWarning,
|
DeprecationWarning,
|
||||||
stacklevel=2,
|
stacklevel=2,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -255,6 +255,7 @@ class RequestStatus(enum.IntEnum):
|
|||||||
FINISHED_LENGTH_CAPPED = enum.auto()
|
FINISHED_LENGTH_CAPPED = enum.auto()
|
||||||
FINISHED_ABORTED = enum.auto()
|
FINISHED_ABORTED = enum.auto()
|
||||||
FINISHED_IGNORED = enum.auto()
|
FINISHED_IGNORED = enum.auto()
|
||||||
|
FINISHED_ERROR = enum.auto()
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.name
|
return self.name
|
||||||
@ -277,4 +278,5 @@ _FINISHED_REASON_MAP = {
|
|||||||
RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH,
|
RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH,
|
||||||
RequestStatus.FINISHED_ABORTED: FinishReason.ABORT,
|
RequestStatus.FINISHED_ABORTED: FinishReason.ABORT,
|
||||||
RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH,
|
RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH,
|
||||||
|
RequestStatus.FINISHED_ERROR: FinishReason.ERROR,
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1275,6 +1275,8 @@ class GPUModelRunner(
|
|||||||
if not isinstance(kv_cache_spec, CrossAttentionSpec):
|
if not isinstance(kv_cache_spec, CrossAttentionSpec):
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
# Zero out buffer for padding requests that are not actually scheduled (CGs)
|
||||||
|
self.encoder_seq_lens.np[:num_reqs] = 0
|
||||||
# Build encoder_seq_lens array mapping request indices to
|
# Build encoder_seq_lens array mapping request indices to
|
||||||
# encoder lengths for inputs scheduled in this batch
|
# encoder lengths for inputs scheduled in this batch
|
||||||
for req_id in num_scheduled_tokens:
|
for req_id in num_scheduled_tokens:
|
||||||
@ -2874,6 +2876,7 @@ class GPUModelRunner(
|
|||||||
# be improved in model runner v2)
|
# be improved in model runner v2)
|
||||||
force_uniform_decode: bool | None = None,
|
force_uniform_decode: bool | None = None,
|
||||||
force_has_lora: bool | None = None,
|
force_has_lora: bool | None = None,
|
||||||
|
num_encoder_reqs: int = 0,
|
||||||
) -> tuple[
|
) -> tuple[
|
||||||
CUDAGraphMode,
|
CUDAGraphMode,
|
||||||
BatchDescriptor,
|
BatchDescriptor,
|
||||||
@ -2890,6 +2893,11 @@ class GPUModelRunner(
|
|||||||
if force_uniform_decode is None
|
if force_uniform_decode is None
|
||||||
else force_uniform_decode
|
else force_uniform_decode
|
||||||
)
|
)
|
||||||
|
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
|
||||||
|
# is present). Also, chunked-prefill is disabled, so batch are uniform.
|
||||||
|
has_encoder_output = (
|
||||||
|
self.model_config.is_encoder_decoder and num_encoder_reqs > 0
|
||||||
|
)
|
||||||
|
|
||||||
has_lora = (
|
has_lora = (
|
||||||
len(self.input_batch.lora_id_to_lora_request) > 0
|
len(self.input_batch.lora_id_to_lora_request) > 0
|
||||||
@ -2909,7 +2917,7 @@ class GPUModelRunner(
|
|||||||
)
|
)
|
||||||
|
|
||||||
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
|
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
|
||||||
num_tokens_padded, use_cascade_attn
|
num_tokens_padded, use_cascade_attn or has_encoder_output
|
||||||
)
|
)
|
||||||
num_tokens_padded = batch_descriptor.num_tokens
|
num_tokens_padded = batch_descriptor.num_tokens
|
||||||
|
|
||||||
@ -3107,6 +3115,7 @@ class GPUModelRunner(
|
|||||||
num_scheduled_tokens_np=num_scheduled_tokens_np,
|
num_scheduled_tokens_np=num_scheduled_tokens_np,
|
||||||
max_num_scheduled_tokens=max_num_scheduled_tokens,
|
max_num_scheduled_tokens=max_num_scheduled_tokens,
|
||||||
use_cascade_attn=cascade_attn_prefix_lens is not None,
|
use_cascade_attn=cascade_attn_prefix_lens is not None,
|
||||||
|
num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug(
|
logger.debug(
|
||||||
@ -3672,74 +3681,89 @@ class GPUModelRunner(
|
|||||||
if self.parallel_config.enable_eplb:
|
if self.parallel_config.enable_eplb:
|
||||||
self.eplb_state = EplbState(self.parallel_config, self.device)
|
self.eplb_state = EplbState(self.parallel_config, self.device)
|
||||||
eplb_models = 0
|
eplb_models = 0
|
||||||
with DeviceMemoryProfiler() as m:
|
|
||||||
time_before_load = time.perf_counter()
|
try:
|
||||||
model_loader = get_model_loader(self.load_config)
|
with DeviceMemoryProfiler() as m:
|
||||||
self.model = model_loader.load_model(
|
time_before_load = time.perf_counter()
|
||||||
vllm_config=self.vllm_config, model_config=self.model_config
|
model_loader = get_model_loader(self.load_config)
|
||||||
)
|
self.model = model_loader.load_model(
|
||||||
if self.lora_config:
|
vllm_config=self.vllm_config, model_config=self.model_config
|
||||||
self.model = self.load_lora_model(
|
|
||||||
self.model, self.vllm_config, self.device
|
|
||||||
)
|
)
|
||||||
if hasattr(self, "drafter"):
|
if self.lora_config:
|
||||||
logger.info_once("Loading drafter model...")
|
self.model = self.load_lora_model(
|
||||||
self.drafter.load_model(self.model)
|
self.model, self.vllm_config, self.device
|
||||||
if (
|
|
||||||
hasattr(self.drafter, "model")
|
|
||||||
and is_mixture_of_experts(self.drafter.model)
|
|
||||||
and self.parallel_config.enable_eplb
|
|
||||||
):
|
|
||||||
spec_config = self.vllm_config.speculative_config
|
|
||||||
assert spec_config is not None
|
|
||||||
assert spec_config.draft_model_config is not None
|
|
||||||
logger.info_once(
|
|
||||||
"EPLB is enabled for drafter model %s.",
|
|
||||||
spec_config.draft_model_config.model,
|
|
||||||
)
|
)
|
||||||
|
if hasattr(self, "drafter"):
|
||||||
|
logger.info_once("Loading drafter model...")
|
||||||
|
self.drafter.load_model(self.model)
|
||||||
|
if (
|
||||||
|
hasattr(self.drafter, "model")
|
||||||
|
and is_mixture_of_experts(self.drafter.model)
|
||||||
|
and self.parallel_config.enable_eplb
|
||||||
|
):
|
||||||
|
spec_config = self.vllm_config.speculative_config
|
||||||
|
assert spec_config is not None
|
||||||
|
assert spec_config.draft_model_config is not None
|
||||||
|
logger.info_once(
|
||||||
|
"EPLB is enabled for drafter model %s.",
|
||||||
|
spec_config.draft_model_config.model,
|
||||||
|
)
|
||||||
|
|
||||||
global_expert_load = (
|
global_expert_load = (
|
||||||
global_expert_loads[eplb_models]
|
global_expert_loads[eplb_models]
|
||||||
if global_expert_loads
|
if global_expert_loads
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
old_global_expert_indices = (
|
old_global_expert_indices = (
|
||||||
old_global_expert_indices_per_model[eplb_models]
|
old_global_expert_indices_per_model[eplb_models]
|
||||||
if old_global_expert_indices_per_model
|
if old_global_expert_indices_per_model
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
if self.eplb_state is None:
|
if self.eplb_state is None:
|
||||||
self.eplb_state = EplbState(self.parallel_config, self.device)
|
self.eplb_state = EplbState(
|
||||||
self.eplb_state.add_model(
|
self.parallel_config, self.device
|
||||||
self.drafter.model,
|
)
|
||||||
spec_config.draft_model_config,
|
self.eplb_state.add_model(
|
||||||
global_expert_load,
|
self.drafter.model,
|
||||||
old_global_expert_indices,
|
spec_config.draft_model_config,
|
||||||
rank_mapping,
|
global_expert_load,
|
||||||
)
|
old_global_expert_indices,
|
||||||
eplb_models += 1
|
rank_mapping,
|
||||||
|
)
|
||||||
|
eplb_models += 1
|
||||||
|
|
||||||
if self.use_aux_hidden_state_outputs:
|
if self.use_aux_hidden_state_outputs:
|
||||||
if not supports_eagle3(self.get_model()):
|
if not supports_eagle3(self.get_model()):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Model does not support EAGLE3 interface but "
|
"Model does not support EAGLE3 interface but "
|
||||||
"aux_hidden_state_outputs was requested"
|
"aux_hidden_state_outputs was requested"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Try to get auxiliary layers from speculative config,
|
# Try to get auxiliary layers from speculative config,
|
||||||
# otherwise use model's default layers
|
# otherwise use model's default layers
|
||||||
aux_layers = self._get_eagle3_aux_layers_from_config()
|
aux_layers = self._get_eagle3_aux_layers_from_config()
|
||||||
if aux_layers:
|
if aux_layers:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Using auxiliary layers from speculative config: %s",
|
"Using auxiliary layers from speculative config: %s",
|
||||||
aux_layers,
|
aux_layers,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
aux_layers = self.model.get_eagle3_aux_hidden_state_layers()
|
aux_layers = self.model.get_eagle3_aux_hidden_state_layers()
|
||||||
|
|
||||||
self.model.set_aux_hidden_state_layers(aux_layers)
|
self.model.set_aux_hidden_state_layers(aux_layers)
|
||||||
time_after_load = time.perf_counter()
|
time_after_load = time.perf_counter()
|
||||||
self.model_memory_usage = m.consumed_memory
|
self.model_memory_usage = m.consumed_memory
|
||||||
|
except torch.cuda.OutOfMemoryError as e:
|
||||||
|
msg = (
|
||||||
|
"Failed to load model - not enough GPU memory. "
|
||||||
|
"Try lowering --gpu-memory-utilization to free memory for weights, "
|
||||||
|
"increasing --tensor-parallel-size, or using --quantization. "
|
||||||
|
"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
|
||||||
|
"for more tips."
|
||||||
|
)
|
||||||
|
combined_msg = f"{msg} (original error: {e})"
|
||||||
|
logger.error(combined_msg)
|
||||||
|
raise e
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
"Model loading took %.4f GiB memory and %.6f seconds",
|
"Model loading took %.4f GiB memory and %.6f seconds",
|
||||||
self.model_memory_usage / GiB_bytes,
|
self.model_memory_usage / GiB_bytes,
|
||||||
|
|||||||
@ -10,7 +10,7 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.config import VllmConfig
|
from vllm.config import VllmConfig, set_current_vllm_config
|
||||||
from vllm.distributed import (
|
from vllm.distributed import (
|
||||||
ensure_model_parallel_initialized,
|
ensure_model_parallel_initialized,
|
||||||
init_distributed_environment,
|
init_distributed_environment,
|
||||||
@ -207,7 +207,8 @@ class TPUWorker:
|
|||||||
# one compiled bytecode. Having one FX graph/cached bytecode per
|
# one compiled bytecode. Having one FX graph/cached bytecode per
|
||||||
# compiled model is required for `support_torch_compile` decorator to
|
# compiled model is required for `support_torch_compile` decorator to
|
||||||
# skip dynamo guard.
|
# skip dynamo guard.
|
||||||
self.model_runner.reset_dynamo_cache()
|
with set_current_vllm_config(self.vllm_config):
|
||||||
|
self.model_runner.reset_dynamo_cache()
|
||||||
|
|
||||||
# Get the maximum amount of memory used by the model weights and
|
# Get the maximum amount of memory used by the model weights and
|
||||||
# intermediate activations.
|
# intermediate activations.
|
||||||
|
|||||||
@ -313,8 +313,12 @@ def bind_kv_cache(
|
|||||||
# TODO - analyze where runner_kv_caches is used and the right
|
# TODO - analyze where runner_kv_caches is used and the right
|
||||||
# way to ensure it properly reflects multiple attention layers
|
# way to ensure it properly reflects multiple attention layers
|
||||||
# in the same decoder block.
|
# in the same decoder block.
|
||||||
if current_platform.is_cuda_alike() or current_platform.is_xpu():
|
if (
|
||||||
# We know that the GPU runner is not impacted by this
|
current_platform.is_cuda_alike()
|
||||||
|
or current_platform.is_xpu()
|
||||||
|
or current_platform.is_cpu()
|
||||||
|
):
|
||||||
|
# We know that the GPU / CPU runner is not impacted by this
|
||||||
# case. Some test code depends on runner_kv_caches, but
|
# case. Some test code depends on runner_kv_caches, but
|
||||||
# not in a way that's impacted by ignoring this.
|
# not in a way that's impacted by ignoring this.
|
||||||
pass
|
pass
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user