Merge branch 'main' into mlm-full-lora-support

2026-06-02 20:44:31 +08:00 · 2025-12-11 11:47:50 +08:00 · 2025-12-11 11:47:50 +08:00 · d1307e1d29
commit d1307e1d29
parent e2ea025ee3 d1e1fb4363
65 changed files with 2160 additions and 400 deletions
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@ -36,6 +36,11 @@ function cpu_tests() {
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run model tests
  docker exec cpu-test bash -c "
    set -e
    pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
  # Run kernel tests
  docker exec cpu-test bash -c "
    set -e
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh
@ -1,73 +0,0 @@
 #!/usr/bin/env bash
 set -euxo pipefail
 # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
 THRESHOLD=${1:-0.25}
 NUM_Q=${2:-1319}
 PORT=${3:-8030}
 OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
 mkdir -p "${OUT_DIR}"
 wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
 }
 MODEL="deepseek-ai/DeepSeek-V2-lite"
 # Set BACKENDS based on platform
 if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
 else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
 fi
 cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
 }
 trap cleanup EXIT
 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
 assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
 PY
  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
 done
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
--- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh
@ -1,74 +0,0 @@
 #!/usr/bin/env bash
 set -euxo pipefail
 # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
 THRESHOLD=${1:-0.25}
 NUM_Q=${2:-1319}
 PORT=${3:-8040}
 OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
 mkdir -p "${OUT_DIR}"
 wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
 }
 MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
 # Set BACKENDS based on platform
 if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
 else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
 fi
 cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
 }
 trap cleanup EXIT
 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
 import json; acc=json.load(open('${OUT}'))['accuracy']
 print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
 assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
 PY
  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
 done
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -1379,22 +1379,4 @@ steps:
  num_gpus: 2
  working_dir: "/vllm-workspace"
  commands:
-  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
 - label: DeepSeek V2-Lite Async EPLB Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
 - label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -574,7 +574,7 @@ async def benchmark(
    )
    print(
        "{:<40} {:<10.2f}".format(
-            "Total Token throughput (tok/s):", metrics.total_token_throughput
+            "Total token throughput (tok/s):", metrics.total_token_throughput
        )
    )
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
  input.casual = casual;
  input.isa = isa;
  input.enable_kv_split = enable_kv_split;
  TORCH_CHECK(casual, "Only supports casual mask for now.");
  VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
    CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {
--- a/docs/benchmarking/cli.md
+++ b/docs/benchmarking/cli.md
@ -84,7 +84,7 @@ Total input tokens:                      1369
 Total generated tokens:                  2212
 Request throughput (req/s):              1.73
 Output token throughput (tok/s):         382.89
-Total Token throughput (tok/s):          619.85
+Total token throughput (tok/s):          619.85
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          71.54
 Median TTFT (ms):                        73.88
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request
 ### v1 Metrics
-In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
+In v1, an extensive set of metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example:
 - `vllm:num_requests_running` (Gauge) - Number of requests currently running.
 - `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
 - `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1).
 - `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
 - `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
 - `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
 - `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
 - `vllm:num_preemptions_total` (Counter) - Number of preemptions.
 - `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
 - `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
 - `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
 - `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
 - `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
 - `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
 - `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
 - `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
 - `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests.
 - `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
 - `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
 - `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
 - `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
 - `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
 - `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
 - `vllm:request_decode_time_seconds` (Histogram) - Request decode time.
--- a/docs/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@ -61,7 +61,7 @@ Now let´s see an example for each of the cases, starting with the `choice`, as
    print(completion.choices[0].message.content)
    ```
-The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
+The next example shows how to use the `regex`. The supported regex syntax depends on the structured output backend. For example, `xgrammar`, `guidance`, and `outlines` use Rust-style regex, while `lm-format-enforcer` uses Python's `re` module. The idea is to generate an email address, given a simple regex template:
 ??? code
--- a/docs/mkdocs/hooks/generate_metrics.py
+++ b/docs/mkdocs/hooks/generate_metrics.py
@ -0,0 +1,149 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import ast
 import logging
 from pathlib import Path
 from typing import Literal
 logger = logging.getLogger("mkdocs")
 ROOT_DIR = Path(__file__).parent.parent.parent.parent
 DOCS_DIR = ROOT_DIR / "docs"
 GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics"
 # Files to scan for metric definitions - each will generate a separate table
 METRIC_SOURCE_FILES = [
    {"path": "vllm/v1/metrics/loggers.py", "output": "general.md"},
    {
        "path": "vllm/v1/spec_decode/metrics.py",
        "output": "spec_decode.md",
    },
    {
        "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
        "output": "nixl_connector.md",
    },
 ]
 class MetricExtractor(ast.NodeVisitor):
    """AST visitor to extract metric definitions."""
    def __init__(self):
        self.metrics: list[dict[str, str]] = []
    def visit_Call(self, node: ast.Call) -> None:
        """Visit function calls to find metric class instantiations."""
        metric_type = self._get_metric_type(node)
        if metric_type:
            name = self._extract_kwarg(node, "name")
            documentation = self._extract_kwarg(node, "documentation")
            if name:
                self.metrics.append(
                    {
                        "name": name,
                        "type": metric_type,
                        "documentation": documentation or "",
                    }
                )
        self.generic_visit(node)
    def _get_metric_type(self, node: ast.Call) -> str | None:
        """Determine if this call creates a metric and return its type."""
        metric_type_map = {
            "_gauge_cls": "gauge",
            "_counter_cls": "counter",
            "_histogram_cls": "histogram",
        }
        if isinstance(node.func, ast.Attribute):
            return metric_type_map.get(node.func.attr)
        return None
    def _extract_kwarg(self, node: ast.Call, key: str) -> str | None:
        """Extract a keyword argument value from a function call."""
        for keyword in node.keywords:
            if keyword.arg == key:
                return self._get_string_value(keyword.value)
        return None
    def _get_string_value(self, node: ast.AST) -> str | None:
        """Extract string value from an AST node."""
        if isinstance(node, ast.Constant):
            return str(node.value) if node.value is not None else None
        return None
 def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]:
    """Parse a Python file and extract all metric definitions."""
    try:
        with open(filepath, encoding="utf-8") as f:
            source = f.read()
        tree = ast.parse(source, filename=str(filepath))
        extractor = MetricExtractor()
        extractor.visit(tree)
        return extractor.metrics
    except Exception as e:
        raise RuntimeError(f"Failed to parse {filepath}: {e}") from e
 def generate_markdown_table(metrics: list[dict[str, str]]) -> str:
    """Generate a markdown table from extracted metrics."""
    if not metrics:
        return "No metrics found.\n"
    # Sort by type, then by name
    metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"]))
    lines = []
    lines.append("| Metric Name | Type | Description |")
    lines.append("|-------------|------|-------------|")
    for metric in metrics_sorted:
        name = metric["name"]
        metric_type = metric["type"].capitalize()
        doc = metric["documentation"].replace("\n", " ").strip()
        lines.append(f"| `{name}` | {metric_type} | {doc} |")
    return "\n".join(lines) + "\n"
 def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    """Generate metrics documentation tables from source files."""
    logger.info("Generating metrics documentation")
    # Create generated directory if it doesn't exist
    GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True)
    total_metrics = 0
    for source_config in METRIC_SOURCE_FILES:
        source_path = source_config["path"]
        output_file = source_config["output"]
        filepath = ROOT_DIR / source_path
        if not filepath.exists():
            raise FileNotFoundError(f"Metrics source file not found: {filepath}")
        logger.debug("Extracting metrics from: %s", source_path)
        metrics = extract_metrics_from_file(filepath)
        logger.debug("Found %d metrics in %s", len(metrics), source_path)
        # Generate and write the markdown table for this source
        table_content = generate_markdown_table(metrics)
        output_path = GENERATED_METRICS_DIR / output_file
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(table_content)
        total_metrics += len(metrics)
        logger.info(
            "Generated metrics table: %s (%d metrics)",
            output_path.relative_to(ROOT_DIR),
            len(metrics),
        )
    logger.info(
        "Total metrics generated: %d across %d files",
        total_metrics,
        len(METRIC_SOURCE_FILES),
    )
--- a/docs/serving/data_parallel_deployment.md
+++ b/docs/serving/data_parallel_deployment.md
@ -24,7 +24,7 @@ There are two distinct modes supported for online deployments - self-contained w
 vLLM supports "self-contained" data parallel deployments that expose a single API endpoint.
-It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs.
+It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. When sizing DP deployments, remember that `--max-num-seqs` applies per DP rank.
 Running a single data parallel deployment across multiple nodes requires a different `vllm serve` to be run on each node, specifying which DP ranks should run on that node. In this case, there will still be a single HTTP entrypoint - the API server(s) will run only on one node, but it doesn't necessarily need to be co-located with the DP ranks.
@ -80,6 +80,18 @@ When deploying large DP sizes using this method, the API server process can beco
 ![DP Internal LB Diagram](../assets/deployment/dp_internal_lb.png)
 </figure>
 ## Hybrid Load Balancing
 Hybrid load balancing sits between the internal and external approaches. Each node runs its own API server(s) that only queue requests to the data-parallel engines colocated on that node. An upstream load balancer (for example, an ingress controller or traffic router) spreads user requests across those per-node endpoints.
 Enable this mode with `--data-parallel-hybrid-lb` while still launching every node with the global data-parallel size. The key differences from internal load balancing are:
 - You must provide `--data-parallel-size-local` and `--data-parallel-start-rank` so each node knows which ranks it owns.
 - Not compatible with `--headless` since every node exposes an API endpoint.
 - Scale `--api-server-count` per node based on the number of local ranks
 In this configuration, each node keeps scheduling decisions local, which reduces cross-node traffic and avoids single node bottlenecks at larger DP sizes.
 ## External Load Balancing
 For larger scale deployments especially, it can make sense to handle the orchestration and load balancing of data parallel ranks externally.
--- a/docs/serving/expert_parallel_deployment.md
+++ b/docs/serving/expert_parallel_deployment.md
@ -40,10 +40,12 @@ EP_SIZE = TP_SIZE × DP_SIZE
 Where:
- `TP_SIZE`: Tensor parallel size (always 1 for now)
+- `TP_SIZE`: Tensor parallel size
 - `DP_SIZE`: Data parallel size
 - `EP_SIZE`: Expert parallel size (computed automatically)
 When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`.
 ### Example Command
 The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section.
@ -81,7 +83,7 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
    --data-parallel-size-local 8 \           # Local DP size on this node (8 GPUs per node)
    --data-parallel-address 192.168.1.100 \  # Replace with actual IP of Node 1
    --data-parallel-rpc-port 13345 \         # RPC communication port, can be any port as long as reachable by all nodes
-    --api-server-count=8                     # Number of API servers for load handling (scaling this out to total ranks are recommended)
+    --api-server-count=8                     # Number of API servers for load handling (scaling this out to # local ranks is recommended)
 # Node 2 (Secondary - headless mode, no API server)
 vllm serve deepseek-ai/DeepSeek-V3-0324 \
@ -119,9 +121,6 @@ While MoE models are typically trained so that each expert receives a similar nu
 Enable EPLB with the `--enable-eplb` flag.
 !!! note "Model Support"
    Currently only DeepSeek V3 architecture is supported.
 When enabled, vLLM collects load statistics with every forward pass and periodically rebalances expert distribution.
 ### EPLB Parameters
@ -134,6 +133,8 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. T
 | `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 |
 | `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
 | `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |
 | `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` |
 | `policy` | The policy type for expert parallel load balancing | `"default"` |
 For example:
@ -183,6 +184,26 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \
 For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--eplb-config '{"num_redundant_experts":32}'` to 32 in large scale use cases so the most popular experts are always available.
 ## Advanced Configuration
 ### Performance Optimization
 - **DeepEP kernels**: The `high_throughput` and `low_latency` kernels are optimized for disaggregated serving and may show poor performance for mixed workloads
 - **Dual Batch Overlap**: Use `--enable-dbo` to overlap all-to-all communication with compute. See [Dual Batch Overlap](../design/dbo.md) for more details.
 - **Async scheduling (experimental)**: Try `--async-scheduling` to overlap scheduling with model execution.
 ### Troubleshooting
 - **`non-zero status: 7 cannot register cq buf`**: When using Infiniband/RoCE, make sure host VM and pods show `ulimit -l` "unlimited".
 - **`init failed for transport: IBGDA`**: The InfiniBand GDA kernel modules are missing. Run `tools/ep_kernels/configure_system_drivers.sh` on each GPU node and reboot. Also fixes error `NVSHMEM API called before NVSHMEM initialization has completed`.
 - **NVSHMEM peer disconnect**: Usually a networking misconfiguration. If deploying via Kubernetes, verify that every pod runs with `hostNetwork: true`, `securityContext.privileged: true` to access Infiniband.
 ### Benchmarking
 - Use simulator flags `VLLM_MOE_ROUTING_SIMULATION_STRATEGY=uniform_random` and `VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1` so token routing is balanced across EP ranks.
 - Increasing `VLLM_MOE_DP_CHUNK_SIZE` may increase throughput by increasing the maximum batch size for inter-rank token transfers. This may cause DeepEP  to throw `assert self.nvshmem_qp_depth >= (num_max_dispatch_tokens_per_rank + 1) * 2`, which can be fixed by increasing environment variable `NVSHMEM_QP_DEPTH`.
 ## Disaggregated Serving (Prefill/Decode Split)
 For production deployments requiring strict SLA guarantees for time-to-first-token and inter-token latency, disaggregated serving allows independent scaling of prefill and decode operations.
@ -273,3 +294,9 @@ except Exception as e:
    print(f"❌ Error during disaggregated serving: {e}")
    print("Check that both prefill and decode instances are running and accessible")
 ```
 ### Benchmarking
 - To simulate the decode deployment of disaggregated serving, pass `--kv-transfer-config '{"kv_connector":"DecodeBenchConnector","kv_role":"kv_both"}'` to the `vllm serve` invocation. The connector populates KV cache with random values so decode can be profiled in isolation.
 - **CUDAGraph capture**: Use `--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'` to enable CUDA graph capture for decode only and save KV cache.
--- a/docs/usage/metrics.md
+++ b/docs/usage/metrics.md
@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server:
 The following metrics are exposed:
-??? code
+## General Metrics
-    ```python
+--8<-- "docs/generated/metrics/general.md"
-    --8<-- "vllm/engine/metrics.py:metrics-definitions"
+
-    ```
+## Speculative Decoding Metrics
 --8<-- "docs/generated/metrics/spec_decode.md"
 ## NIXL KV Connector Metrics
 --8<-- "docs/generated/metrics/nixl_connector.md"
 ## Deprecation Policy
 Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
 but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@ -51,6 +51,7 @@ hooks:
  - docs/mkdocs/hooks/remove_announcement.py
  - docs/mkdocs/hooks/generate_examples.py
  - docs/mkdocs/hooks/generate_argparse.py
  - docs/mkdocs/hooks/generate_metrics.py
  - docs/mkdocs/hooks/url_schemes.py
 plugins:
--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
@ -1,2 +1,2 @@
-lmcache
+lmcache >= 0.3.10.post1
 nixl >= 0.7.1 # Required for disaggregated prefill
--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@ -0,0 +1,228 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass, field
 from http import HTTPStatus
 from typing import Any
 from unittest.mock import AsyncMock, MagicMock
 import pytest
 from vllm.config.multimodal import MultiModalConfig
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncLLM
 MODEL_NAME = "openai-community/gpt2"
 MODEL_NAME_SHORT = "gpt2"
 BASE_MODEL_PATHS = [
    BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
    BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
 ]
@dataclass
 class MockHFConfig:
    model_type: str = "any"
@dataclass
 class MockModelConfig:
    task = "generate"
    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False
    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
 def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_chat = OpenAIServingChat(
        engine,
        models,
        response_role="assistant",
        request_logger=None,
        chat_template=None,
        chat_template_content_format="auto",
    )
    async def _fake_process_inputs(
        request_id,
        engine_prompt,
        sampling_params,
        *,
        lora_request,
        trace_headers,
        priority,
    ):
        return dict(engine_prompt), {}
    async def _fake_preprocess_chat(*args, **kwargs):
        # return conversation, request_prompts, engine_prompts
        return (
            [{"role": "user", "content": "Test"}],
            [[1, 2, 3]],
            [{"prompt_token_ids": [1, 2, 3]}],
        )
    serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
    serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat)
    return serving_chat
@pytest.mark.asyncio
 async def test_chat_error_non_stream():
    """test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    serving_chat = _build_serving_chat(mock_engine)
    completion_output = CompletionOutput(
        index=0,
        text="",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    request_output = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )
    async def mock_generate(*args, **kwargs):
        yield request_output
    mock_engine.generate = MagicMock(side_effect=mock_generate)
    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=False,
    )
    response = await serving_chat.create_chat_completion(request)
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InternalServerError"
    assert response.error.message == "Internal server error"
    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio
 async def test_chat_error_stream():
    """test finish_reason='error' returns 500 InternalServerError (streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    serving_chat = _build_serving_chat(mock_engine)
    completion_output_1 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=None,
    )
    request_output_1 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_1],
        finished=False,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )
    completion_output_2 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    request_output_2 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_2],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )
    async def mock_generate(*args, **kwargs):
        yield request_output_1
        yield request_output_2
    mock_engine.generate = MagicMock(side_effect=mock_generate)
    request = ChatCompletionRequest(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Test prompt"}],
        max_tokens=10,
        stream=True,
    )
    response = await serving_chat.create_chat_completion(request)
    chunks = []
    async for chunk in response:
        chunks.append(chunk)
    assert len(chunks) >= 2
    assert any("Internal server error" in chunk for chunk in chunks), (
        f"Expected error message in chunks: {chunks}"
    )
    assert chunks[-1] == "data: [DONE]\n\n"
--- a/tests/entrypoints/openai/test_completion_error.py
+++ b/tests/entrypoints/openai/test_completion_error.py
@ -0,0 +1,216 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass, field
 from http import HTTPStatus
 from typing import Any
 from unittest.mock import AsyncMock, MagicMock
 import pytest
 from vllm.config.multimodal import MultiModalConfig
 from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncLLM
 MODEL_NAME = "openai-community/gpt2"
 MODEL_NAME_SHORT = "gpt2"
 BASE_MODEL_PATHS = [
    BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
    BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT),
 ]
@dataclass
 class MockHFConfig:
    model_type: str = "any"
@dataclass
 class MockModelConfig:
    task = "generate"
    runner_type = "generate"
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
    logits_processors: list[str] | None = None
    diff_sampling_param: dict | None = None
    allowed_local_media_path: str = ""
    allowed_media_domains: list[str] | None = None
    encoder_config = None
    generation_config: str = "auto"
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    skip_tokenizer_init = False
    def get_diff_sampling_param(self):
        return self.diff_sampling_param or {}
 def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion:
    models = OpenAIServingModels(
        engine_client=engine,
        base_model_paths=BASE_MODEL_PATHS,
    )
    serving_completion = OpenAIServingCompletion(
        engine,
        models,
        request_logger=None,
    )
    async def _fake_process_inputs(
        request_id,
        engine_prompt,
        sampling_params,
        *,
        lora_request,
        trace_headers,
        priority,
    ):
        return dict(engine_prompt), {}
    serving_completion._process_inputs = AsyncMock(side_effect=_fake_process_inputs)
    return serving_completion
@pytest.mark.asyncio
 async def test_completion_error_non_stream():
    """test finish_reason='error' returns 500 InternalServerError (non-streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    serving_completion = _build_serving_completion(mock_engine)
    completion_output = CompletionOutput(
        index=0,
        text="",
        token_ids=[],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    request_output = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )
    async def mock_generate(*args, **kwargs):
        yield request_output
    mock_engine.generate = MagicMock(side_effect=mock_generate)
    request = CompletionRequest(
        model=MODEL_NAME,
        prompt="Test prompt",
        max_tokens=10,
        stream=False,
    )
    response = await serving_completion.create_completion(request)
    assert isinstance(response, ErrorResponse)
    assert response.error.type == "InternalServerError"
    assert response.error.message == "Internal server error"
    assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio
 async def test_completion_error_stream():
    """test finish_reason='error' returns 500 InternalServerError (streaming)"""
    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False
    mock_engine.model_config = MockModelConfig()
    mock_engine.input_processor = MagicMock()
    mock_engine.io_processor = MagicMock()
    serving_completion = _build_serving_completion(mock_engine)
    completion_output_1 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason=None,
    )
    request_output_1 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_1],
        finished=False,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )
    completion_output_2 = CompletionOutput(
        index=0,
        text="Hello",
        token_ids=[100],
        cumulative_logprob=None,
        logprobs=None,
        finish_reason="error",
    )
    request_output_2 = RequestOutput(
        request_id="test-id",
        prompt="Test prompt",
        prompt_token_ids=[1, 2, 3],
        prompt_logprobs=None,
        outputs=[completion_output_2],
        finished=True,
        metrics=None,
        lora_request=None,
        encoder_prompt=None,
        encoder_prompt_token_ids=None,
    )
    async def mock_generate(*args, **kwargs):
        yield request_output_1
        yield request_output_2
    mock_engine.generate = MagicMock(side_effect=mock_generate)
    request = CompletionRequest(
        model=MODEL_NAME,
        prompt="Test prompt",
        max_tokens=10,
        stream=True,
    )
    response = await serving_completion.create_completion(request)
    chunks = []
    async for chunk in response:
        chunks.append(chunk)
    assert len(chunks) >= 2
    assert any("Internal server error" in chunk for chunk in chunks), (
        f"Expected error message in chunks: {chunks}"
    )
    assert chunks[-1] == "data: [DONE]\n\n"
--- a/tests/entrypoints/openai/test_responses_error.py
+++ b/tests/entrypoints/openai/test_responses_error.py
@ -0,0 +1,89 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from http import HTTPStatus
 from unittest.mock import MagicMock
 import pytest
 from vllm.entrypoints.openai.protocol import ErrorResponse
 from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing
@pytest.mark.asyncio
 async def test_raise_if_error_raises_generation_error():
    """test _raise_if_error raises GenerationError"""
    # create a minimal OpenAIServing instance
    mock_engine = MagicMock()
    mock_engine.model_config = MagicMock()
    mock_engine.model_config.max_model_len = 100
    mock_models = MagicMock()
    serving = OpenAIServing(
        engine_client=mock_engine,
        models=mock_models,
        request_logger=None,
    )
    # test that error finish_reason raises GenerationError
    with pytest.raises(GenerationError) as exc_info:
        serving._raise_if_error("error", "test-request-id")
    assert str(exc_info.value) == "Internal server error"
    assert exc_info.value.status_code == HTTPStatus.INTERNAL_SERVER_ERROR
    # test that other finish_reasons don't raise
    serving._raise_if_error("stop", "test-request-id")  # should not raise
    serving._raise_if_error("length", "test-request-id")  # should not raise
    serving._raise_if_error(None, "test-request-id")  # should not raise
@pytest.mark.asyncio
 async def test_convert_generation_error_to_response():
    """test _convert_generation_error_to_response creates proper ErrorResponse"""
    mock_engine = MagicMock()
    mock_engine.model_config = MagicMock()
    mock_engine.model_config.max_model_len = 100
    mock_models = MagicMock()
    serving = OpenAIServing(
        engine_client=mock_engine,
        models=mock_models,
        request_logger=None,
    )
    # create a GenerationError
    gen_error = GenerationError("Internal server error")
    # convert to ErrorResponse
    error_response = serving._convert_generation_error_to_response(gen_error)
    assert isinstance(error_response, ErrorResponse)
    assert error_response.error.type == "InternalServerError"
    assert error_response.error.message == "Internal server error"
    assert error_response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR
@pytest.mark.asyncio
 async def test_convert_generation_error_to_streaming_response():
    """test _convert_generation_error_to_streaming_response output"""
    mock_engine = MagicMock()
    mock_engine.model_config = MagicMock()
    mock_engine.model_config.max_model_len = 100
    mock_models = MagicMock()
    serving = OpenAIServing(
        engine_client=mock_engine,
        models=mock_models,
        request_logger=None,
    )
    # create a GenerationError
    gen_error = GenerationError("Internal server error")
    # convert to streaming error response
    error_json = serving._convert_generation_error_to_streaming_response(gen_error)
    assert isinstance(error_json, str)
    assert "Internal server error" in error_json
    assert "InternalServerError" in error_json
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@ -92,16 +92,19 @@ def run_test(
    *,
    tensor_parallel_size: int,
    distributed_executor_backend: str | None = None,
    dtype: str = "half",
 ) -> None:
    prompt_list = PROMPTS * 10
    expected_list = EXPECTED[model] * 10
    with vllm_runner(
        model,
-        dtype="half",
+        dtype=dtype,
        max_model_len=448,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
        # TODO (NickLucche) figure out output differences with non-eager and re-enable
        enforce_eager=True,
    ) as vllm_model:
        llm = vllm_model.llm
@ -120,12 +123,28 @@ def run_test(
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
-def test_models(vllm_runner, model) -> None:
+def test_models(vllm_runner, model, dtype) -> None:
    run_test(
        vllm_runner,
        model,
        tensor_parallel_size=1,
        dtype=dtype,
    )
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
 def test_models_cpu(vllm_runner, model, dtype) -> None:
    # @create_new_process_for_each_test() does not work for some runners
    # TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
    run_test(
        vllm_runner,
        model,
        tensor_parallel_size=1,
        dtype=dtype,
    )
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@ -8,6 +8,7 @@ import pytest
 import vllm.envs as envs
 from vllm.envs import (
    disable_envs_cache,
    enable_envs_cache,
    env_list_with_choices,
    env_set_with_choices,
@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch):
    envs.__getattr__ = envs.__getattr__.__wrapped__
 def test_getattr_with_reset(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setenv("VLLM_HOST_IP", "1.1.1.1")
    # __getattr__ is not decorated with functools.cache
    assert not hasattr(envs.__getattr__, "cache_info")
    # Enable envs cache and ignore ongoing environment changes
    enable_envs_cache()
    assert envs.VLLM_HOST_IP == "1.1.1.1"
    # With cache enabled, the environment variable value is cached and unchanged
    monkeypatch.setenv("VLLM_HOST_IP", "2.2.2.2")
    assert envs.VLLM_HOST_IP == "1.1.1.1"
    disable_envs_cache()
    assert envs.VLLM_HOST_IP == "2.2.2.2"
    # After cache disabled, the environment variable value would be synced
    # with os.environ
    monkeypatch.setenv("VLLM_HOST_IP", "3.3.3.3")
    assert envs.VLLM_HOST_IP == "3.3.3.3"
 def test_is_envs_cache_enabled() -> None:
    assert not envs._is_envs_cache_enabled()
    enable_envs_cache()
    assert envs._is_envs_cache_enabled()
    # Only wrap one-layer of cache, so we only need to
    # call disable once to reset.
    enable_envs_cache()
    enable_envs_cache()
    enable_envs_cache()
    disable_envs_cache()
    assert not envs._is_envs_cache_enabled()
    disable_envs_cache()
    assert not envs._is_envs_cache_enabled()
 class TestEnvWithChoices:
    """Test cases for env_with_choices function."""
--- a/tests/v1/engine/test_init_error_messaging.py
+++ b/tests/v1/engine/test_init_error_messaging.py
@ -0,0 +1,54 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 from vllm.v1.core.kv_cache_utils import check_enough_kv_cache_memory
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 def test_kv_cache_oom_no_memory():
    from unittest.mock import MagicMock
    config = MagicMock()
    config.model_config.max_model_len = 2048
    spec = {
        "layer_0": FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=128,
            dtype="float16",
        )
    }
    with pytest.raises(ValueError):
        check_enough_kv_cache_memory(config, spec, 0)
 def test_kv_cache_oom_insufficient_memory(monkeypatch):
    from unittest.mock import MagicMock
    config = MagicMock()
    config.model_config.max_model_len = 2048
    config.cache_config.block_size = 16
    config.parallel_config.tensor_parallel_size = 1
    config.parallel_config.pipeline_parallel_size = 1
    config.parallel_config.decode_context_parallel_size = 1
    monkeypatch.setattr(
        "vllm.v1.core.kv_cache_utils.max_memory_usage_bytes",
        lambda c, s: 100 * 1024**3,  # 100 GiB
    )
    spec = {
        "layer_0": FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=128,
            dtype="float16",
        )
    }
    with pytest.raises(ValueError):
        check_enough_kv_cache_memory(config, spec, 1024**3)  # 1 GiB
--- a/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py
+++ b/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py
@ -0,0 +1,163 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 test that invalid blocks are evicted from prefix cache to prevent pollution.
 verifies that when sync-loading fails, invalid blocks are removed from the
 prefix cache hash table so future requests cannot match and reuse corrupted data.
 """
 from collections.abc import Callable
 from unittest.mock import Mock
 import pytest
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.request import Request, RequestStatus
 from .utils import (
    create_model_runner_output,
    create_request,
    create_scheduler,
    create_vllm_config,
 )
 pytestmark = pytest.mark.cpu_test
 def _make_get_num_new_matched_tokens(
    req_num_new_matched_tokens: dict[str, int],
    async_load: bool,
 ) -> Callable[[Request, int], tuple[int, bool]]:
    def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]:
        value = req_num_new_matched_tokens.get(request.request_id, 0)
        return value, async_load
    return get_num_new_matched_tokens
@pytest.fixture
 def fail_scheduler():
    """scheduler with kv_load_failure_policy='fail'"""
    vllm_config = create_vllm_config()
    vllm_config.kv_transfer_config.kv_load_failure_policy = "fail"
    return create_scheduler(vllm_config)
 def test_invalid_blocks_evicted_prevents_cache_pollution(
    fail_scheduler: Scheduler,
 ):
    """
    verify invalid blocks are evicted to prevent future cache hits.
    scenario:
    1. request 1 loads externally-computed blocks (sync mode)
    2. some blocks fail to load and are marked invalid
    3. with fail policy, invalid blocks should be evicted from prefix cache
    4. request is marked as FINISHED_ERROR
    """
    num_prompt_blocks = 100
    num_external_computed_blocks = 99
    invalid_block_idx = 50
    num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size
    num_external_computed_tokens = (
        num_external_computed_blocks * fail_scheduler.block_size
    )
    # request 1: will have invalid blocks
    request1 = create_request(num_tokens=num_prompt_tokens, request_id=1)
    fail_scheduler.add_request(request=request1)
    req_num_new_matched_tokens = {
        request1.request_id: num_external_computed_tokens,
    }
    # mock connector indicating sync load
    fail_scheduler.connector = Mock()
    fail_scheduler.connector.get_num_new_matched_tokens.side_effect = (
        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False)
    )
    fail_scheduler.connector.request_finished.return_value = (False, None)
    fail_scheduler.connector.take_events.return_value = ()
    scheduler_output = fail_scheduler.schedule()
    # request should be running with sync KV load
    assert len(fail_scheduler.running) == 1
    assert request1.status == RequestStatus.RUNNING
    # get allocated block IDs
    req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
    invalid_block_id = req_block_ids[invalid_block_idx]
    invalid_block_ids = {invalid_block_id}
    # get the block object to verify eviction later
    block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id]
    # cache the blocks to simulate they've been computed and cached
    # (in real scenario blocks would be cached after compute)
    fail_scheduler.kv_cache_manager.cache_blocks(request1, num_external_computed_tokens)
    # verify block has a hash (is cached) before reporting invalid blocks
    assert block.block_hash is not None, (
        f"block {invalid_block_id} should be cached (have a hash) before "
        f"eviction test, but hash is None"
    )
    # report invalid blocks
    model_runner_output = create_model_runner_output(
        [request1],
        invalid_block_ids=invalid_block_ids,
        use_eos=False,
    )
    fail_scheduler.update_from_output(scheduler_output, model_runner_output)
    # verify request finished with error (fail policy)
    assert request1.status == RequestStatus.FINISHED_ERROR
    # critical assertion: invalid block and all subsequent blocks should be evicted
    # all blocks from invalid_block_idx onwards become invalid since they were
    # computed based on the failed block
    for idx in range(invalid_block_idx, len(req_block_ids)):
        block_id = req_block_ids[idx]
        block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id]
        assert block_obj.block_hash is None, (
            f"block {block_id} at index {idx} should have been evicted "
            f"(hash reset to None), but hash is {block_obj.block_hash}. "
            f"All blocks from index {invalid_block_idx} onwards should be evicted "
            f"since they depend on the invalid block at index {invalid_block_idx}."
        )
    # verify cache contains exactly the valid blocks (before first affected block)
    # and none of the invalid blocks (from first affected block onwards)
    # valid blocks: all blocks before invalid_block_idx should be cached
    for idx in range(invalid_block_idx):
        block_id = req_block_ids[idx]
        block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id]
        assert block_obj.block_hash is not None, (
            f"valid block {block_id} at index {idx} should still be cached "
            f"(have a hash), but hash is None. Only blocks from index "
            f"{invalid_block_idx} onwards should be evicted."
        )
    # invalid blocks: verify they're not in the cached_block_hash_to_block map
    cached_blocks = (
        fail_scheduler.kv_cache_manager.block_pool.cached_block_hash_to_block
    )
    cached_block_ids = {
        b.block_id
        for blocks_val in cached_blocks._cache.values()
        for b in (
            [blocks_val] if not isinstance(blocks_val, dict) else blocks_val.values()
        )
    }
    for idx in range(invalid_block_idx, len(req_block_ids)):
        block_id = req_block_ids[idx]
        assert block_id not in cached_block_ids, (
            f"invalid block {block_id} at index {idx} should not be in cache hash table"
        )
--- a/tests/v1/kv_connector/unit/test_error_propagation.py
+++ b/tests/v1/kv_connector/unit/test_error_propagation.py
@ -0,0 +1,147 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from collections.abc import Callable
 from unittest.mock import Mock
 import pytest
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.request import FinishReason, Request, RequestStatus
 from .utils import (
    create_model_runner_output,
    create_request,
    create_scheduler,
    create_vllm_config,
 )
 pytestmark = pytest.mark.cpu_test
 def _make_get_num_new_matched_tokens(
    req_num_new_matched_tokens: dict[str, int],
    async_load: bool,
 ) -> Callable[[Request, int], tuple[int, bool]]:
    def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]:
        value = req_num_new_matched_tokens.get(request.request_id, 0)
        return value, async_load
    return get_num_new_matched_tokens
@pytest.fixture
 def fail_scheduler():
    """scheduler with kv_load_failure_policy='fail'"""
    vllm_config = create_vllm_config()
    vllm_config.kv_transfer_config.kv_load_failure_policy = "fail"
    return create_scheduler(vllm_config)
 def test_error_propagation_sync_load(fail_scheduler: Scheduler):
    """test invalid_block_ids with fail policy -> FINISHED_ERROR (sync load)"""
    num_prompt_blocks = 100
    num_external_computed_blocks = 99
    invalid_block_idx = 50
    num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size
    num_external_computed_tokens = (
        num_external_computed_blocks * fail_scheduler.block_size
    )
    request = create_request(num_tokens=num_prompt_tokens)
    fail_scheduler.add_request(request=request)
    req_num_new_matched_tokens = {
        request.request_id: num_external_computed_tokens,
    }
    fail_scheduler.connector = Mock()
    fail_scheduler.connector.get_num_new_matched_tokens.side_effect = (
        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False)
    )
    fail_scheduler.connector.request_finished.return_value = (False, None)
    fail_scheduler.connector.take_events.return_value = ()
    scheduler_output = fail_scheduler.schedule()
    assert len(fail_scheduler.running) == 1
    assert len(scheduler_output.scheduled_new_reqs) == 1
    assert fail_scheduler.connector.get_num_new_matched_tokens.call_count == 1
    req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
    invalid_block_ids = {req_block_ids[invalid_block_idx]}
    model_runner_output = create_model_runner_output(
        [request],
        invalid_block_ids=invalid_block_ids,
        use_eos=True,
    )
    outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output)
    assert request.status == RequestStatus.FINISHED_ERROR
    assert request.get_finished_reason() == FinishReason.ERROR
    assert len(outputs) == 1
    engine_outputs = next(iter(outputs.values()))
    assert len(engine_outputs.outputs) == 1
    output = engine_outputs.outputs[0]
    assert output.request_id == request.request_id
    assert output.finish_reason == FinishReason.ERROR
    assert len(fail_scheduler.running) == 0
 def test_error_propagation_async_load(fail_scheduler: Scheduler):
    """test invalid_block_ids with fail policy -> FINISHED_ERROR (async load)"""
    num_prompt_blocks = 100
    num_external_computed_blocks = 99
    invalid_block_idx = 50
    num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size
    num_external_computed_tokens = (
        num_external_computed_blocks * fail_scheduler.block_size
    )
    request = create_request(num_tokens=num_prompt_tokens)
    fail_scheduler.add_request(request=request)
    req_num_new_matched_tokens = {
        request.request_id: num_external_computed_tokens,
    }
    fail_scheduler.connector = Mock()
    fail_scheduler.connector.get_num_new_matched_tokens.side_effect = (
        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, True)
    )
    fail_scheduler.connector.request_finished.return_value = (False, None)
    fail_scheduler.connector.take_events.return_value = ()
    scheduler_output = fail_scheduler.schedule()
    assert len(fail_scheduler.waiting) == 1
    assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
    assert request.num_computed_tokens == 0
    (req_block_ids,) = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id)
    invalid_block_ids = {req_block_ids[invalid_block_idx]}
    model_runner_output = create_model_runner_output(
        reqs=[],
        finished_recving=set(),
        invalid_block_ids=invalid_block_ids,
        use_eos=True,
    )
    outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output)
    assert request.status == RequestStatus.FINISHED_ERROR
    assert request.get_finished_reason() == FinishReason.ERROR
    assert len(outputs) == 1
    engine_outputs = next(iter(outputs.values()))
    assert len(engine_outputs.outputs) == 1
    output = engine_outputs.outputs[0]
    assert output.request_id == request.request_id
    assert output.finish_reason == FinishReason.ERROR
    assert len(fail_scheduler.waiting) == 0
--- a/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py
+++ b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py
@ -0,0 +1,454 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Tests for correctness in invalid block handling.
 These tests verify correct behavior in three scenarios:
 1. Sync recompute case: Blocks should not be freed for running requests
   that need to recompute invalid blocks
 2. Sync fail case: Invalid blocks must be evicted from cache when request fails
 3. Async recompute case: Invalid blocks should not be cached after transfer
 """
 from collections.abc import Callable
 from unittest.mock import Mock
 import pytest
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.request import FinishReason, Request, RequestStatus
 from .utils import (
    create_model_runner_output,
    create_request,
    create_scheduler,
    create_vllm_config,
 )
 pytestmark = pytest.mark.cpu_test
 def _make_get_num_new_matched_tokens(
    req_num_new_matched_tokens: dict[str, int],
    async_load: bool,
 ) -> Callable[[Request, int], tuple[int, bool]]:
    def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]:
        value = req_num_new_matched_tokens.get(request.request_id, 0)
        return value, async_load
    return get_num_new_matched_tokens
@pytest.fixture
 def fail_scheduler():
    """scheduler with kv_load_failure_policy='fail'"""
    vllm_config = create_vllm_config()
    vllm_config.kv_transfer_config.kv_load_failure_policy = "fail"
    return create_scheduler(vllm_config)
@pytest.fixture
 def recompute_scheduler():
    """scheduler with kv_load_failure_policy='recompute'"""
    vllm_config = create_vllm_config()
    vllm_config.kv_transfer_config.kv_load_failure_policy = "recompute"
    return create_scheduler(vllm_config)
 def test_sync_recompute_blocks_not_freed_for_running_requests(
    recompute_scheduler: Scheduler,
 ):
    """
    Test sync recompute case - blocks must not be freed for running requests.
    When a running request has invalid blocks and retry_policy is 'recompute':
    1. Request should remain in RUNNING state
    2. num_computed_tokens should be truncated to invalid block boundary
    3. Blocks should NOT be freed (request still needs them for recomputation)
    4. Request should remain in scheduler.requests and scheduler.running
    """
    num_prompt_blocks = 100
    num_external_computed_blocks = 99
    invalid_block_idx = 50
    num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size
    num_external_computed_tokens = (
        num_external_computed_blocks * recompute_scheduler.block_size
    )
    request = create_request(num_tokens=num_prompt_tokens)
    recompute_scheduler.add_request(request=request)
    req_num_new_matched_tokens = {
        request.request_id: num_external_computed_tokens,
    }
    # mock connector indicating sync load
    recompute_scheduler.connector = Mock()
    recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = (
        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False)
    )
    recompute_scheduler.connector.request_finished.return_value = (False, None)
    recompute_scheduler.connector.take_events.return_value = ()
    scheduler_output = recompute_scheduler.schedule()
    # request should be running with sync KV load
    assert len(recompute_scheduler.running) == 1
    assert len(scheduler_output.scheduled_new_reqs) == 1
    assert request.status == RequestStatus.RUNNING
    # get the allocated block IDs before invalid blocks are reported
    req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
    invalid_block_ids = {req_block_ids[invalid_block_idx]}
    # store original num_computed_tokens for comparison
    original_num_computed_tokens = request.num_computed_tokens
    model_runner_output = create_model_runner_output(
        [request],
        invalid_block_ids=invalid_block_ids,
        use_eos=False,  # not finished - should continue running
    )
    outputs = recompute_scheduler.update_from_output(
        scheduler_output, model_runner_output
    )
    # critical assertions for recompute case:
    # 1. request should still be RUNNING (not finished, not aborted)
    assert request.status == RequestStatus.RUNNING, (
        f"Request should remain RUNNING for recompute, got {request.status}"
    )
    # 2. num_computed_tokens should be truncated to first invalid block
    expected_truncated_tokens = invalid_block_idx * recompute_scheduler.block_size
    assert request.num_computed_tokens == expected_truncated_tokens, (
        f"num_computed_tokens should be truncated to {expected_truncated_tokens}, "
        f"got {request.num_computed_tokens}"
    )
    assert request.num_computed_tokens < original_num_computed_tokens, (
        "num_computed_tokens should be reduced after invalid block detection"
    )
    # 3. no output should be generated (request is still running)
    # the request should be skipped in the output loop
    assert len(outputs) == 0 or request.request_id not in [
        out.request_id for outs in outputs.values() for out in outs.outputs
    ], "No output should be generated for recompute requests"
    # 4. request should still be in running queue
    assert request in recompute_scheduler.running, (
        "Request should remain in running queue for recomputation"
    )
    # 5. request should still be in scheduler.requests (not deleted)
    assert request.request_id in recompute_scheduler.requests, (
        "Request should not be deleted from scheduler.requests"
    )
    # 6. blocks should NOT be freed - verify blocks are still allocated
    try:
        allocated_blocks = recompute_scheduler.kv_cache_manager.get_block_ids(
            request.request_id
        )
        assert allocated_blocks is not None
        assert len(allocated_blocks[0]) > 0, (
            "Blocks should still be allocated for recomputation"
        )
    except KeyError:
        pytest.fail(
            "Blocks were freed incorrectly! Running requests need their blocks "
            "to recompute invalid portions."
        )
    # 7. verify request can be rescheduled in next step
    scheduler_output_2 = recompute_scheduler.schedule()
    # request should appear in the new schedule to recompute invalid blocks
    scheduled_req_ids = [
        req.request_id for req in scheduler_output_2.scheduled_new_reqs
    ]
    if scheduler_output_2.num_scheduled_tokens:
        scheduled_req_ids.extend(scheduler_output_2.num_scheduled_tokens.keys())
    assert (
        request.request_id in scheduled_req_ids or len(recompute_scheduler.running) > 0
    ), "Request should be reschedulable for recomputation"
 def test_sync_fail_invalid_blocks_evicted(fail_scheduler: Scheduler):
    """
    Test sync fail case - invalid blocks must be evicted from cache.
    When a request fails with policy='fail' and has invalid blocks from sync loading:
    1. Request should be finished with FINISHED_ERROR
    2. Invalid blocks should be evicted from the KV cache
    3. Valid blocks (if shared) should remain in cache
    4. Future requests should not reuse the invalid blocks
    This test verifies that invalid blocks are properly evicted to prevent
    cache corruption and reuse of invalid data.
    """
    num_prompt_blocks = 100
    num_external_computed_blocks = 99
    invalid_block_idx = 50
    num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size
    num_external_computed_tokens = (
        num_external_computed_blocks * fail_scheduler.block_size
    )
    request = create_request(num_tokens=num_prompt_tokens)
    fail_scheduler.add_request(request=request)
    req_num_new_matched_tokens = {
        request.request_id: num_external_computed_tokens,
    }
    # mock connector indicating sync load
    fail_scheduler.connector = Mock()
    fail_scheduler.connector.get_num_new_matched_tokens.side_effect = (
        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False)
    )
    fail_scheduler.connector.request_finished.return_value = (False, None)
    fail_scheduler.connector.take_events.return_value = ()
    scheduler_output = fail_scheduler.schedule()
    # request should be running with sync KV load
    assert len(fail_scheduler.running) == 1
    assert request.status == RequestStatus.RUNNING
    # get allocated block IDs
    req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0]
    invalid_block_id = req_block_ids[invalid_block_idx]
    invalid_block_ids = {invalid_block_id}
    # verify the block is in the block pool before we report it as invalid
    block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id]
    assert block is not None
    # report invalid blocks - request should fail
    model_runner_output = create_model_runner_output(
        [request],
        invalid_block_ids=invalid_block_ids,
        use_eos=True,
    )
    outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output)
    # verify request is finished with error
    assert request.status == RequestStatus.FINISHED_ERROR
    assert request.get_finished_reason() == FinishReason.ERROR
    # verify output is generated
    assert len(outputs) == 1
    engine_outputs = next(iter(outputs.values()))
    assert len(engine_outputs.outputs) == 1
    output = engine_outputs.outputs[0]
    assert output.request_id == request.request_id
    assert output.finish_reason == FinishReason.ERROR
    # verify the request was removed from scheduler
    assert request.request_id not in fail_scheduler.requests
    assert len(fail_scheduler.running) == 0
    # critical: verify invalid block was actually freed from cache
    # this is the key assertion - the invalid block should no longer be
    # tracked by the KV cache manager for this request
    # if it's still there, a future request could reuse the invalid data
    try:
        block_ids = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id)
        # if we get here, check if blocks were actually freed
        if block_ids is not None and len(block_ids[0]) > 0:
            pytest.fail(
                f"Invalid blocks still tracked for finished request! "
                f"Request {request.request_id} should have been freed but "
                f"still has {len(block_ids[0])} blocks allocated."
            )
        # blocks list exists but is empty - this is fine, they were freed
    except KeyError:
        # expected - request completely removed from tracking
        pass
    # critical: verify invalid block was evicted from prefix cache
    # the block should no longer have a hash (hash is reset on eviction)
    assert block.block_hash is None, (
        f"Invalid block {invalid_block_id} should have been evicted from cache "
        f"(hash should be None), but hash is still {block.block_hash}"
    )
 def test_async_recompute_blocks_not_cached_when_invalid(
    recompute_scheduler: Scheduler,
 ):
    """
    Test async recompute case - invalid blocks not cached after transfer.
    When async KV loading has invalid blocks and retry_policy is 'recompute':
    1. Blocks are allocated but not cached yet
    2. When async transfer completes, only valid blocks should be cached
    3. Invalid blocks should never enter the prefix cache
    This test verifies correctness, the failed_recving_kv_req_ids protection
    ensures only valid blocks are cached when the transfer completes, and we
    only evict blocks from cache that are already hashed in the block table.
    """
    from unittest.mock import patch
    num_prompt_blocks = 100
    num_external_computed_blocks = 99
    invalid_block_idx = 50
    num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size
    num_external_computed_tokens = (
        num_external_computed_blocks * recompute_scheduler.block_size
    )
    request = create_request(num_tokens=num_prompt_tokens)
    recompute_scheduler.add_request(request=request)
    req_num_new_matched_tokens = {
        request.request_id: num_external_computed_tokens,
    }
    # mock connector indicating async load
    recompute_scheduler.connector = Mock()
    recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = (
        _make_get_num_new_matched_tokens(req_num_new_matched_tokens, True)
    )
    recompute_scheduler.connector.request_finished.return_value = (False, None)
    recompute_scheduler.connector.take_events.return_value = ()
    scheduler_output = recompute_scheduler.schedule()
    # request should be waiting for remote KVs
    assert len(recompute_scheduler.waiting) == 1
    assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
    assert request.num_computed_tokens == 0
    # get the allocated block IDs
    (req_block_ids,) = recompute_scheduler.kv_cache_manager.get_block_ids(
        request.request_id
    )
    invalid_block_id = req_block_ids[invalid_block_idx]
    invalid_block_ids = {invalid_block_id}
    # get the block object to verify it's not cached yet and stays uncached
    block = recompute_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id]
    # verify block has no hash before invalid blocks are reported
    assert block.block_hash is None, (
        "Async loading blocks should not be cached yet (no hash)"
    )
    # report invalid blocks (transfer not finished yet)
    model_runner_output = create_model_runner_output(
        reqs=[],
        finished_recving=None,  # transfer NOT finished
        invalid_block_ids=invalid_block_ids,
        use_eos=False,
    )
    # critical: spy on evict_blocks to verify it's NOT called for async blocks
    original_evict_blocks = recompute_scheduler.kv_cache_manager.evict_blocks
    evict_blocks_calls = []
    def evict_blocks_spy(block_ids):
        evict_blocks_calls.append(set(block_ids))
        return original_evict_blocks(block_ids)
    with patch.object(
        recompute_scheduler.kv_cache_manager, "evict_blocks", evict_blocks_spy
    ):
        recompute_scheduler.update_from_output(scheduler_output, model_runner_output)
    # verify evict_blocks was NOT called (async blocks excluded from eviction)
    assert len(evict_blocks_calls) == 0, (
        f"evict_blocks should not be called for async-only invalid blocks, "
        f"but was called {len(evict_blocks_calls)} time(s) with {evict_blocks_calls}"
    )
    # request should still be waiting (not finished with error due to recompute policy)
    assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS
    assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids
    # verify num_computed_tokens was truncated to before invalid block
    expected_valid_tokens = invalid_block_idx * recompute_scheduler.block_size
    assert request.num_computed_tokens == expected_valid_tokens
    # verify invalid block still has no hash (was not evicted)
    assert block.block_hash is None, (
        f"Async loading blocks shouldn't be cached or evicted. "
        f"Block {invalid_block_id} hash should be None but is {block.block_hash}"
    )
    # now simulate async transfer completing
    model_runner_output_2 = create_model_runner_output(
        reqs=[],
        finished_recving={request.request_id},
        invalid_block_ids=None,
        use_eos=False,
    )
    recompute_scheduler.update_from_output(scheduler_output, model_runner_output_2)
    # verify request is now marked as finished receiving and ready to be processed
    assert request.request_id in recompute_scheduler.finished_recving_kv_req_ids
    assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids
    # critical: verify invalid block still has no hash before recompute
    # the async transfer invalid data was never cached
    assert block.block_hash is None, (
        f"Invalid block {invalid_block_id} should not be cached before recompute "
        f"(hash should be None), but hash is {block.block_hash}"
    )
    # critical end-to-end test: spy on cache_blocks to verify it's called with
    # the truncated num_computed_tokens value
    original_cache_blocks = recompute_scheduler.kv_cache_manager.cache_blocks
    cache_blocks_calls = []
    def cache_blocks_spy(req, num_tokens):
        cache_blocks_calls.append((req.request_id, num_tokens))
        return original_cache_blocks(req, num_tokens)
    with patch.object(
        recompute_scheduler.kv_cache_manager, "cache_blocks", cache_blocks_spy
    ):
        # call schedule() again - this triggers _update_waiting_for_remote_kv()
        # which should call cache_blocks with the truncated value
        recompute_scheduler.schedule()
    # verify cache_blocks was called with the truncated value
    assert len(cache_blocks_calls) == 1, (
        f"cache_blocks should be called exactly once, "
        f"got {len(cache_blocks_calls)} calls"
    )
    cached_req_id, cached_num_tokens = cache_blocks_calls[0]
    assert cached_req_id == request.request_id
    assert cached_num_tokens == expected_valid_tokens, (
        f"cache_blocks should be called with truncated value {expected_valid_tokens}, "
        f"but was called with {cached_num_tokens}"
    )
    # request should now be RUNNING (scheduled immediately after transfer completes)
    # the flow is: WAITING_FOR_REMOTE_KVS -> WAITING -> RUNNING in same schedule() call
    assert request.status == RequestStatus.RUNNING
    # num_computed_tokens should be >= expected_valid_tokens because the scheduler
    # will schedule additional new tokens (up to max_num_batched_tokens) for the request
    assert request.num_computed_tokens >= expected_valid_tokens, (
        f"num_computed_tokens should be at least {expected_valid_tokens}, "
        f"got {request.num_computed_tokens}"
    )
    # request should no longer be in the failed/finished receiving sets
    assert request.request_id not in recompute_scheduler.failed_recving_kv_req_ids
    assert request.request_id not in recompute_scheduler.finished_recving_kv_req_ids
    # request should be in the running queue
    assert request in recompute_scheduler.running
--- a/tools/ep_kernels/README.md
+++ b/tools/ep_kernels/README.md
@ -7,7 +7,7 @@ Here we break down the requirements in 2 steps:
 1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this.
 2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine.
-2 is necessary for multi-node deployment.
+Step 2 is necessary for multi-node deployment.
 All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`.
@ -23,6 +23,6 @@ TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh
 Additional step for multi-node deployment:
 ```bash
-sudo bash configure_system_drivers.sh
+sudo bash configure_system_drivers.sh # update-initramfs can take several minutes
 sudo reboot # Reboot is required to load the new driver
 ```
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@ -788,7 +788,7 @@ async def benchmark(
        )
    print(
        "{:<40} {:<10.2f}".format(
-            "Total Token throughput (tok/s):", metrics.total_token_throughput
+            "Total token throughput (tok/s):", metrics.total_token_throughput
        )
    )
--- a/vllm/config/kv_transfer.py
+++ b/vllm/config/kv_transfer.py
@ -64,6 +64,11 @@ class KVTransferConfig:
    enable_permute_local_kv: bool = False
    """Experiment feature flag to enable HND to NHD KV Transfer"""
    kv_load_failure_policy: Literal["recompute", "fail"] = "recompute"
    """Policy for handling KV cache load failures.
    'recompute': reschedule the request to recompute failed blocks (default)
    'fail': immediately fail the request with an error finish reason"""
    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@ -666,8 +666,9 @@ class VllmConfig:
        default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
        self._apply_optimization_level_defaults(default_config)
        if (
-            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+            self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
            and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
        ):
            logger.info(
@ -692,22 +693,29 @@ class VllmConfig:
        if current_platform.support_static_graph_mode():
            # if cudagraph_mode has full cudagraphs, we need to check support
-            if (
+            if model_config := self.model_config:
-                self.compilation_config.cudagraph_mode.has_full_cudagraphs()
+                if (
-                and self.model_config is not None
+                    self.compilation_config.cudagraph_mode.has_full_cudagraphs()
-            ):
+                    and model_config.pooler_config is not None
-                if self.model_config.pooler_config is not None:
+                ):
                    logger.warning_once(
                        "Pooling models do not support full cudagraphs. "
                        "Overriding cudagraph_mode to PIECEWISE."
                    )
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-                elif self.model_config.is_encoder_decoder:
+                elif (
-                    logger.warning_once(
+                    model_config.is_encoder_decoder
-                        "Encoder-decoder models do not support full cudagraphs. "
+                    and self.compilation_config.cudagraph_mode
-                        "Overriding cudagraph_mode to PIECEWISE."
+                    not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
                ):
                    logger.info_once(
                        "Encoder-decoder models do not support %s. "
                        "Overriding cudagraph_mode to FULL_DECODE_ONLY.",
                        self.compilation_config.cudagraph_mode.name,
                    )
                    self.compilation_config.cudagraph_mode = (
                        CUDAGraphMode.FULL_DECODE_ONLY
                    )
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
            # disable cudagraph when enforce eager execution
            if self.model_config is not None and self.model_config.enforce_eager:
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import functools
 import pickle
 import threading
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@ -43,6 +44,33 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL
 from_bytes_big = functools.partial(int.from_bytes, byteorder="big")
 # Memory fence for cross-process shared memory visibility.
 # Required for correct producer-consumer synchronization when using
 # shared memory without locks.
 _memory_fence_lock = threading.Lock()
 def memory_fence():
    """
    Full memory barrier for shared memory synchronization.
    Ensures all prior memory writes are visible to other processes before
    any subsequent reads. This is critical for lock-free producer-consumer
    patterns using shared memory.
    Implementation acquires and immediately releases a lock. Python's
    threading.Lock provides sequentially consistent memory barrier semantics
    across all major platforms (POSIX, Windows). This is a lightweight
    operation (~20ns) that guarantees:
    - All stores before the barrier are visible to other threads/processes
    - All loads after the barrier see the latest values
    """
    # Lock acquire/release provides full memory barrier semantics.
    # Using context manager ensures lock release even on exceptions.
    with _memory_fence_lock:
        pass
 def to_bytes_big(value: int, size: int) -> bytes:
    return value.to_bytes(size, byteorder="big")
@ -414,6 +442,10 @@ class MessageQueue:
        n_warning = 1
        while True:
            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
                # Memory fence ensures we see the latest read flags from readers.
                # Without this, we may read stale flags from our CPU cache and
                # spin indefinitely even though readers have completed.
                memory_fence()
                read_count = sum(metadata_buffer[1:])
                written_flag = metadata_buffer[0]
                if written_flag and read_count != self.buffer.n_reader:
@ -458,6 +490,10 @@ class MessageQueue:
                    metadata_buffer[i] = 0
                # mark the block as written
                metadata_buffer[0] = 1
                # Memory fence ensures the write is visible to readers on other cores
                # before we proceed. Without this, readers may spin indefinitely
                # waiting for a write that's stuck in our CPU's store buffer.
                memory_fence()
                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
                break
@ -473,6 +509,10 @@ class MessageQueue:
        n_warning = 1
        while True:
            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
                # Memory fence ensures we see the latest writes from the writer.
                # Without this, we may read stale flags from our CPU cache
                # and spin indefinitely even though writer has updated them.
                memory_fence()
                read_flag = metadata_buffer[self.local_reader_rank + 1]
                written_flag = metadata_buffer[0]
                if not written_flag or read_flag:
@ -513,6 +553,10 @@ class MessageQueue:
                # caller has read from the buffer
                # set the read flag
                metadata_buffer[self.local_reader_rank + 1] = 1
                # Memory fence ensures the read flag is visible to the writer.
                # Without this, writer may not see our read completion and
                # could wait indefinitely for all readers to finish.
                memory_fence()
                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
                self._read_spin_timer.record_activity()
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@ -322,6 +322,9 @@ async def transfer_layer(
    num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
    assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
    assert num_physical_experts == ep_size * num_local_physical_experts
    # A buffer to hold the expert weights in one layer during the exchange.
    # NOTE: Currently we assume the same weights across different layers
    # have the same shape.
    is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
        num_local_experts=num_local_physical_experts,
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py
@ -27,7 +27,7 @@ from lmcache.v1.lookup_client.lmcache_async_lookup_client import (
    LMCacheAsyncLookupServer,
 )
 from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer
-from lmcache.v1.plugin.plugin_launcher import PluginLauncher
+from lmcache.v1.plugin.runtime_plugin_launcher import RuntimePluginLauncher
 from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.config import VllmConfig
@ -683,7 +683,7 @@ class LMCacheConnectorV1Impl:
            self.api_server = InternalAPIServer(self)
            self.api_server.start()
            # Launch plugins
-            self.plugin_launcher = PluginLauncher(
+            self.plugin_launcher = RuntimePluginLauncher(
                self.config,
                role,
                self.worker_count,
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@ -1586,6 +1586,8 @@ def destroy_distributed_environment():
 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
    # Reset environment variable cache
    envs.disable_envs_cache()
    # Ensure all objects are not frozen before cleanup
    gc.unfreeze()
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@ -9,7 +9,7 @@ import cloudpickle
 import torch.nn as nn
 from pydantic import ValidationError
 from tqdm.auto import tqdm
-from typing_extensions import TypeVar, deprecated
+from typing_extensions import TypeVar
 from vllm.beam_search import (
    BeamSearchInstance,
@ -73,7 +73,6 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams
 from vllm.tasks import PoolingTask
 from vllm.tokenizers import MistralTokenizer, TokenizerLike
 from vllm.tokenizers.hf import get_cached_tokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils.collection_utils import as_iter, is_list_of
 from vllm.utils.counter import Counter
@ -367,16 +366,6 @@ class LLM:
    def get_tokenizer(self) -> TokenizerLike:
        return self.llm_engine.get_tokenizer()
    @deprecated("`set_tokenizer` is deprecated and will be removed in v0.13.")
    def set_tokenizer(self, tokenizer: TokenizerLike) -> None:
        # While CachedTokenizer is dynamic, have no choice but
        # compare class name. Misjudgment will arise from
        # user-defined tokenizer started with 'Cached'
        if tokenizer.__class__.__name__.startswith("Cached"):
            self.llm_engine.tokenizer = tokenizer
        else:
            self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer)
    def reset_mm_cache(self) -> None:
        self.input_processor.clear_mm_cache()
        self.llm_engine.reset_mm_cache()
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@ -51,7 +51,11 @@ from vllm.entrypoints.openai.protocol import (
    ToolCall,
    UsageInfo,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
+from vllm.entrypoints.openai.serving_engine import (
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,
 )
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall
@ -380,6 +384,8 @@ class OpenAIServingChat(OpenAIServing):
                tokenizer,
                request_metadata,
            )
        except GenerationError as e:
            return self._convert_generation_error_to_response(e)
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))
@ -1120,6 +1126,10 @@ class OpenAIServingChat(OpenAIServing):
                    # if the model is finished generating
                    else:
                        # check for error finish reason and abort streaming
                        # finish_reason='error' indicates a retryable error
                        self._raise_if_error(output.finish_reason, request_id)
                        # check to make sure we haven't "forgotten" to stream
                        #   any tokens that were generated but previously
                        #   matched by partial json parsing
@ -1287,6 +1297,8 @@ class OpenAIServingChat(OpenAIServing):
                        delta=False,
                    )
        except GenerationError as e:
            yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
        except Exception as e:
            # TODO: Use a vllm-specific Validation Error
            logger.exception("Error in chat completion stream generator.")
@ -1327,6 +1339,9 @@ class OpenAIServingChat(OpenAIServing):
        role = self.get_chat_request_role(request)
        for output in final_res.outputs:
            # check for error finish reason and raise GenerationError
            # finish_reason='error' indicates a retryable request-level internal error
            self._raise_if_error(output.finish_reason, request_id)
            token_ids = output.token_ids
            out_logprobs = output.logprobs
            tool_call_info = None
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@ -24,7 +24,11 @@ from vllm.entrypoints.openai.protocol import (
    RequestResponseMetadata,
    UsageInfo,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs
+from vllm.entrypoints.openai.serving_engine import (
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,
 )
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.renderer import RenderConfig
 from vllm.entrypoints.utils import get_max_tokens, should_include_usage
@ -300,6 +304,8 @@ class OpenAIServingCompletion(OpenAIServing):
            )
        except asyncio.CancelledError:
            return self.create_error_response("Client disconnected")
        except GenerationError as e:
            return self._convert_generation_error_to_response(e)
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))
@ -437,6 +443,8 @@ class OpenAIServingCompletion(OpenAIServing):
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason
                    self._raise_if_error(finish_reason, request_id)
                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
@ -498,8 +506,11 @@ class OpenAIServingCompletion(OpenAIServing):
            # report to FastAPI middleware aggregate usage across all choices
            request_metadata.final_usage_info = final_usage_info
        except GenerationError as e:
            yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"
        except Exception as e:
            # TODO: Use a vllm-specific Validation Error
            logger.exception("Error in completion stream generator.")
            data = self.create_streaming_error_response(str(e))
            yield f"data: {data}\n\n"
        yield "data: [DONE]\n\n"
@ -530,6 +541,8 @@ class OpenAIServingCompletion(OpenAIServing):
            out_logprobs: GenericSequence[dict[int, Logprob] | None] | None
            for output in final_res.outputs:
                self._raise_if_error(output.finish_reason, request_id)
                assert request.max_tokens is not None
                if request.echo:
                    if request.return_token_ids:
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@ -133,6 +133,15 @@ from vllm.utils.async_utils import (
 from vllm.utils.collection_utils import is_list_of
 from vllm.v1.engine import EngineCoreRequest
 class GenerationError(Exception):
    """raised when finish_reason indicates internal server error (500)"""
    def __init__(self, message: str = "Internal server error"):
        super().__init__(message)
        self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR
 logger = init_logger(__name__)
 CompletionLikeRequest: TypeAlias = (
@ -456,6 +465,29 @@ class OpenAIServing:
            # Iterate through all beam inference results
            for i, result in enumerate(output):
                current_beam = all_beams[i]
                # check for error finish reason and abort beam search
                if result.outputs[0].finish_reason == "error":
                    # yield error output and terminate beam search
                    yield RequestOutput(
                        request_id=request_id,
                        prompt=prompt_text,
                        outputs=[
                            CompletionOutput(
                                index=0,
                                text="",
                                token_ids=[],
                                cumulative_logprob=None,
                                logprobs=None,
                                finish_reason="error",
                            )
                        ],
                        finished=True,
                        prompt_token_ids=prompt_token_ids,
                        prompt_logprobs=None,
                    )
                    return
                if result.outputs[0].logprobs is not None:
                    logprobs = result.outputs[0].logprobs[0]
                    all_beams_token_id.extend(list(logprobs.keys()))
@ -780,6 +812,35 @@ class OpenAIServing:
        )
        return json_str
    def _raise_if_error(self, finish_reason: str | None, request_id: str) -> None:
        """Raise GenerationError if finish_reason indicates an error."""
        if finish_reason == "error":
            logger.error(
                "Request %s failed with an internal error during generation",
                request_id,
            )
            raise GenerationError("Internal server error")
    def _convert_generation_error_to_response(
        self, e: GenerationError
    ) -> ErrorResponse:
        """Convert GenerationError to ErrorResponse."""
        return self.create_error_response(
            str(e),
            err_type="InternalServerError",
            status_code=e.status_code,
        )
    def _convert_generation_error_to_streaming_response(
        self, e: GenerationError
    ) -> str:
        """Convert GenerationError to streaming error response."""
        return self.create_streaming_error_response(
            str(e),
            err_type="InternalServerError",
            status_code=e.status_code,
        )
    async def _check_model(
        self,
        request: AnyRequest,
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@ -50,6 +50,7 @@ from openai.types.responses.response_reasoning_item import (
 )
 from openai.types.responses.tool import Mcp, Tool
 from openai_harmony import Message as OpenAIHarmonyMessage
 from pydantic import TypeAdapter
 from vllm import envs
 from vllm.engine.protocol import EngineClient
@ -94,7 +95,10 @@ from vllm.entrypoints.openai.protocol import (
    ResponseUsage,
    StreamingResponsesResponse,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_engine import (
    GenerationError,
    OpenAIServing,
 )
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.responses_utils import (
    construct_input_messages,
@ -541,6 +545,8 @@ class OpenAIServingResponses(OpenAIServing):
                tokenizer,
                request_metadata,
            )
        except GenerationError as e:
            return self._convert_generation_error_to_response(e)
        except Exception as e:
            return self.create_error_response(str(e))
@ -648,6 +654,8 @@ class OpenAIServingResponses(OpenAIServing):
                    status = "incomplete"
                elif context.finish_reason == "abort":
                    status = "cancelled"
                else:
                    self._raise_if_error(context.finish_reason, request.request_id)
            else:
                status = "incomplete"
        elif isinstance(context, ParsableContext):
@ -673,6 +681,9 @@ class OpenAIServingResponses(OpenAIServing):
            assert len(final_res.outputs) == 1
            final_output = final_res.outputs[0]
            # finish_reason='error' indicates retryable internal error
            self._raise_if_error(final_output.finish_reason, request.request_id)
            output = self._make_response_output_items(request, final_output, tokenizer)
            if request.enable_response_messages:
@ -1066,6 +1077,8 @@ class OpenAIServingResponses(OpenAIServing):
            async for event in generator:
                event_deque.append(event)
                new_event_signal.set()  # Signal new event available
        except GenerationError as e:
            response = self._convert_generation_error_to_response(e)
        except Exception as e:
            logger.exception("Background request failed for %s", request.request_id)
            response = self.create_error_response(str(e))
@ -1089,6 +1102,8 @@ class OpenAIServingResponses(OpenAIServing):
    ):
        try:
            response = await self.responses_full_generator(request, *args, **kwargs)
        except GenerationError as e:
            response = self._convert_generation_error_to_response(e)
        except Exception as e:
            logger.exception("Background request failed for %s", request.request_id)
            response = self.create_error_response(str(e))
@ -1227,6 +1242,8 @@ class OpenAIServingResponses(OpenAIServing):
                continue
            if ctx.last_output.outputs:
                output = ctx.last_output.outputs[0]
                # finish_reason='error' indicates a retryable error
                self._raise_if_error(output.finish_reason, request.request_id)
                if reasoning_parser:
                    delta_message = reasoning_parser.extract_reasoning_streaming(
                        previous_text=previous_text,
@ -1522,6 +1539,9 @@ class OpenAIServingResponses(OpenAIServing):
        async for ctx in result_generator:
            assert isinstance(ctx, StreamingHarmonyContext)
            # finish_reason='error' indicates a retryable error
            self._raise_if_error(ctx.finish_reason, request.request_id)
            if ctx.is_expecting_start():
                current_output_index += 1
                sent_output_item_added = False
@ -2016,18 +2036,25 @@ class OpenAIServingResponses(OpenAIServing):
                )
            )
-            async for event_data in processer(
+            try:
-                request,
+                async for event_data in processer(
-                sampling_params,
+                    request,
-                result_generator,
+                    sampling_params,
-                context,
+                    result_generator,
-                model_name,
+                    context,
-                tokenizer,
+                    model_name,
-                request_metadata,
+                    tokenizer,
-                created_time,
+                    request_metadata,
-                _increment_sequence_number_and_return,
+                    created_time,
-            ):
+                    _increment_sequence_number_and_return,
-                yield event_data
+                ):
                    yield event_data
            except GenerationError as e:
                error_json = self._convert_generation_error_to_streaming_response(e)
                yield _increment_sequence_number_and_return(
                    TypeAdapter(StreamingResponsesResponse).validate_json(error_json)
                )
                return
            async def empty_async_generator():
                # A hack to trick Python to think this is a generator but
--- a/vllm/envs.py
+++ b/vllm/envs.py
@ -1580,6 +1580,12 @@ def __getattr__(name: str):
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 def _is_envs_cache_enabled() -> bool:
    """Checked if __getattr__ is wrapped with functools.cache"""
    global __getattr__
    return hasattr(__getattr__, "cache_clear")
 def enable_envs_cache() -> None:
    """
    Enables caching of environment variables. This is useful for performance
@ -1590,6 +1596,9 @@ def enable_envs_cache() -> None:
    runtime overhead. This also means that environment variables should NOT
    be updated after the service is initialized.
    """
    if _is_envs_cache_enabled():
        # Avoid wrapping functools.cache multiple times
        return
    # Tag __getattr__ with functools.cache
    global __getattr__
    __getattr__ = functools.cache(__getattr__)
@ -1599,6 +1608,17 @@ def enable_envs_cache() -> None:
        __getattr__(key)
 def disable_envs_cache() -> None:
    """
    Resets the environment variables cache. It could be used to isolate environments
    between unit tests.
    """
    global __getattr__
    # If __getattr__ is wrapped by functions.cache, unwrap the caching layer.
    if _is_envs_cache_enabled():
        __getattr__ = __getattr__.__wrapped__
 def __dir__():
    return list(environment_variables.keys())
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@ -1556,6 +1556,14 @@ class FusedMoE(CustomOp):
                    f"EPLB is not supported for {self.quant_method.method_name}."
                )
        def valid_grouping() -> bool:
            # Check if num_experts is greater than num_expert_group
            # and is divisible by num_expert_group
            num_experts = router_logits.shape[-1]
            if num_experts <= self.num_expert_group:
                return False
            return num_experts % self.num_expert_group == 0
        indices_type = self.quant_method.topk_indices_dtype
        # Check if we should use a routing simulation strategy
@ -1570,7 +1578,7 @@ class FusedMoE(CustomOp):
            )
        # DeepSeekv2 uses grouped_top_k
-        elif self.use_grouped_topk:
+        elif self.use_grouped_topk and valid_grouping():
            assert self.topk_group is not None
            assert self.num_expert_group is not None
            if rocm_aiter_ops.is_fused_moe_enabled():
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@ -502,6 +502,7 @@ class HunYuanVisionTransformer(nn.Module):
        cu_seqlens: list = [0]
        hidden_states = x.to(device=self.device, dtype=self.dtype)
        # embeddings = patch_embeds + patch_pos_embed
        hidden_states = self.embeddings(hidden_states, grid_thw)
        for t, h, w in grid_thw:
@ -515,8 +516,14 @@ class HunYuanVisionTransformer(nn.Module):
        hidden_states = hidden_states.reshape(seq_len, -1)
        hidden_states = hidden_states.unsqueeze(0)
-        for layer_num, layer in enumerate(self.layers):
+
-            hidden_states = layer(hidden_states)
+        # build per-image lengths once
        split_lengths = [int(h) * int(w) for (_, h, w) in grid_thw]
        for layer in self.layers:
            # hidden_states: (1, T_total, D)
            parts = hidden_states.split(split_lengths, dim=1)  # list of (1, L_i, D)
            parts = [layer(p) for p in parts]
            hidden_states = torch.cat(parts, dim=1)
        # adapter
        split_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
--- a/vllm/model_executor/models/minimax_m2.py
+++ b/vllm/model_executor/models/minimax_m2.py
@ -201,7 +201,7 @@ class MiniMaxM2Attention(nn.Module):
        self.rotary_emb = get_rope(
            self.head_dim,
-            rotary_dim=rotary_dim,
+            rotary_dim=self.head_dim,
            max_position=max_position_embeddings,
            rope_parameters=rope_parameters,
        )
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@ -89,7 +89,7 @@ def _extract_data_from_linear_base_module(
    assert m.quant_method.quant_config is not None
    w = m.weight
-    ws = m.weight_scale
+    ws = m.weight_scale_inv if hasattr(m, "weight_scale_inv") else m.weight_scale
    quant_block_size = m.quant_method.quant_config.weight_block_size
    assert isinstance(w, torch.Tensor)
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@ -954,7 +954,7 @@ MultiModalKwargsOptionalItems: TypeAlias = (
 )
-@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.")
+@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.14.")
 class MultiModalKwargs(UserDict[str, NestedTensors]):
    """
    A dictionary that represents the keyword arguments to
@ -964,7 +964,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
    @staticmethod
    @deprecated(
        "`MultiModalKwargs.from_hf_inputs` is deprecated and "
-        "will be removed in v0.13. "
+        "will be removed in v0.14. "
        "Please use `MultiModalKwargsItems.from_hf_inputs` and "
        "access the tensor data using `.get_data()`."
    )
@ -977,7 +977,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
    @staticmethod
    @deprecated(
        "`MultiModalKwargs.from_items` is deprecated and "
-        "will be removed in v0.13. "
+        "will be removed in v0.14. "
        "Please use `MultiModalKwargsItems.from_seq` and "
        "access the tensor data using `.get_data()`."
    )
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@ -429,12 +429,12 @@ def group_mm_kwargs_by_modality(
    if merge_by_field_config is not None:
        logger.warning_once(
            "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
-            "is deprecated and will be removed in v0.13."
+            "is deprecated and will be removed in v0.14."
        )
    if multimodal_cpu_fields is not None:
        logger.warning_once(
            "The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` "
-            "is deprecated and will be removed in v0.13."
+            "is deprecated and will be removed in v0.14."
        )
    from vllm.multimodal.inputs import MultiModalKwargsItems
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@ -403,7 +403,21 @@ class RocmPlatform(Platform):
                compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
        if cache_config and cache_config.block_size is None:
-            cache_config.block_size = 16
+            if (
                envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER
                # NOTE: This block has been deprecated
                # or get_env_variable_attn_backend()
                # == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN
                # TODO: monitor https://github.com/vllm-project/vllm/pull/30396
                # to see how we can transition to the new way of selecting
                # attention backends
            ):
                cache_config.block_size = 64
                logger.warning(
                    "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64."
                )
            else:
                cache_config.block_size = 16
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker"
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@ -17,7 +17,7 @@ def __getattr__(name: str):
        warnings.warn(
            "`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
            "`vllm.tokenizers.TokenizerLike`. "
-            "The old name will be removed in v0.13.",
+            "The old name will be removed in v0.14.",
            DeprecationWarning,
            stacklevel=2,
        )
@ -29,7 +29,7 @@ def __getattr__(name: str):
        warnings.warn(
            "`vllm.transformers_utils.tokenizer.get_tokenizer` "
            "has been moved to `vllm.tokenizers.get_tokenizer`. "
-            "The old name will be removed in v0.13.",
+            "The old name will be removed in v0.14.",
            DeprecationWarning,
            stacklevel=2,
        )
@ -41,7 +41,7 @@ def __getattr__(name: str):
        warnings.warn(
            "`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
            "has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
-            "The old name will be removed in v0.13.",
+            "The old name will be removed in v0.14.",
            DeprecationWarning,
            stacklevel=2,
        )
@ -53,7 +53,7 @@ def __getattr__(name: str):
        warnings.warn(
            "`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
            "has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
-            "The old name will be removed in v0.13.",
+            "The old name will be removed in v0.14.",
            DeprecationWarning,
            stacklevel=2,
        )
@ -65,7 +65,7 @@ def __getattr__(name: str):
        warnings.warn(
            "`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
            "has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
-            "The old name will be removed in v0.13.",
+            "The old name will be removed in v0.14.",
            DeprecationWarning,
            stacklevel=2,
        )
@ -75,7 +75,7 @@ def __getattr__(name: str):
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
-@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.")
+@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.")
 def decode_tokens(
    tokenizer: TokenizerLike,
    token_ids: list[int],
@ -97,7 +97,7 @@ def decode_tokens(
    return tokenizer.decode(token_ids, **kw_args)
-@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.")
+@deprecated("Will be removed in v0.14. Please use `tokenizer.encode()` instead.")
 def encode_tokens(
    tokenizer: TokenizerLike,
    text: str,
--- a/vllm/transformers_utils/tokenizer_base.py
+++ b/vllm/transformers_utils/tokenizer_base.py
@ -11,7 +11,7 @@ def __getattr__(name: str):
        warnings.warn(
            "`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been "
            "moved to `vllm.tokenizers.TokenizerLike`. "
-            "The old name will be removed in v0.13.",
+            "The old name will be removed in v0.14.",
            DeprecationWarning,
            stacklevel=2,
        )
@ -23,7 +23,7 @@ def __getattr__(name: str):
        warnings.warn(
            "`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been "
            "moved to `vllm.tokenizers.TokenizerRegistry`. "
-            "The old name will be removed in v0.13.",
+            "The old name will be removed in v0.14.",
            DeprecationWarning,
            stacklevel=2,
        )
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool:
 def force_use_trtllm_attention() -> bool | None:
    """
    This function should only be called during initialization stage when vllm config
    is set.
    Return `None` if --attention-config.use_trtllm_attention is not set,
    return `True` if TRTLLM attention is forced to be used,
    return `False` if TRTLLM attention is forced to be not used.
@ -296,11 +298,12 @@ def use_trtllm_attention(
    kv_cache_dtype: str,
    q_dtype: torch.dtype,
    is_prefill: bool,
    # None means auto-detection, True means force on, False means force off
    force_use_trtllm: bool | None = None,
    has_sinks: bool = False,
    has_spec: bool = False,
 ) -> bool:
    """Return `True` if TRTLLM attention is used."""
    force_use_trtllm = force_use_trtllm_attention()
    # CLI argument is set to 0 - respect it
    if force_use_trtllm is not None and not force_use_trtllm:
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@ -21,7 +21,7 @@ from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,
    split_decodes_and_prefills,
 )
-from vllm.v1.kv_cache_interface import AttentionSpec
+from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec
 logger = init_logger(__name__)
@ -50,11 +50,13 @@ class CPUAttentionBackend(AttentionBackend):
    @classmethod
    def supports_attn_type(cls, attn_type: str) -> bool:
-        """CPU attention supports decoder and encoder-only attention."""
+        """CPU attention supports decoder,
        encoder-only and encoder-decoder attention."""
        return attn_type in (
            AttentionType.DECODER,
            AttentionType.ENCODER,
            AttentionType.ENCODER_ONLY,
            AttentionType.ENCODER_DECODER,
        )
    @staticmethod
@ -136,6 +138,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
            self.window_size = -1
        self.block_size = vllm_config.cache_config.block_size
        self.isa = _get_attn_isa(self.dtype, self.block_size)
        self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec)
    def build(
        self,
@ -151,7 +154,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
        seq_lens = common_attn_metadata.seq_lens
        block_table_tensor = common_attn_metadata.block_table_tensor
        slot_mapping = common_attn_metadata.slot_mapping
-        causal = common_attn_metadata.causal
+        causal = False if self.is_cross_attention else common_attn_metadata.causal
        sdpa_start_loc = query_start_loc
        num_decode_tokens = 0
@ -171,22 +174,19 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
            query_start_loc = query_start_loc[: num_decodes + 1]
            block_table_tensor = block_table_tensor[:num_decodes]
-        sheduler_metadata = None
+        sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
-        if causal:
+            num_reqs=num_reqs,
-            # for decode batch, use the custom kernel
+            num_heads=self.num_heads,
-            sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
+            num_kv_heads=self.num_kv_heads,
-                num_reqs=num_reqs,
+            head_dim=self.head_dim,
-                num_heads=self.num_heads,
+            seq_lens=seq_lens,
-                num_kv_heads=self.num_kv_heads,
+            dtype=self.dtype,
-                head_dim=self.head_dim,
+            query_start_loc=query_start_loc,
-                seq_lens=seq_lens,
+            causal=causal,
-                dtype=self.dtype,
+            sliding_window_size=self.window_size,
-                query_start_loc=query_start_loc,
+            isa=self.isa,
-                causal=causal,
+            enable_kv_split=True,
-                sliding_window_size=self.window_size,
+        )
                isa=self.isa,
                enable_kv_split=True,
            )
        attn_metadata = CPUAttentionMetadata(
            isa=self.isa,
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
        self.cache_config = vllm_config.cache_config
        self.model_config = vllm_config.model_config
        self.attention_config = vllm_config.attention_config
        self._workspace_buffer = None
        self._prefill_wrapper: (
            BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None
@ -779,6 +780,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
            self.cache_dtype,
            self.q_data_type,
            is_prefill=True,
            force_use_trtllm=self.attention_config.use_trtllm_attention,
            has_sinks=self.has_sinks,
            has_spec=uses_spec_reorder,
        )
--- a/vllm/v1/core/block_pool.py
+++ b/vllm/v1/core/block_pool.py
@ -397,6 +397,25 @@ class BlockPool:
            [block for block in blocks_list if block.ref_cnt == 0 and not block.is_null]
        )
    def evict_blocks(self, block_ids: set[int]) -> None:
        """evict blocks from the prefix cache by their block IDs.
        only evicts blocks that are currently cached (have a hash). blocks
        with ref_cnt > 0 are not freed from the block pool, only evicted
        from the prefix cache hash table.
        Args:
            block_ids: Set of block IDs to evict from cache.
        """
        for block_id in block_ids:
            assert block_id < len(self.blocks), (
                f"Invalid block_id {block_id} >= {len(self.blocks)}. "
                f"This indicates a bug in the KV connector - workers should "
                f"only report block IDs that were allocated by the scheduler."
            )
            block = self.blocks[block_id]
            self._maybe_evict_cached_block(block)
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache. This function may be used in RLHF
        flows to invalid prefix caching after the weights are updated,
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@ -333,6 +333,14 @@ class KVCacheManager:
        """
        self.coordinator.free(request.request_id)
    def evict_blocks(self, block_ids: set[int]) -> None:
        """evict blocks from the prefix cache by their block IDs.
        Args:
            block_ids: Set of block IDs to evict from cache.
        """
        self.block_pool.evict_blocks(block_ids)
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache. This function may be used in RLHF
        flows to invalidate prefix caching after the weights are updated,
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@ -687,7 +687,9 @@ def check_enough_kv_cache_memory(
        raise ValueError(
            "No available memory for the cache blocks. "
            "Try increasing `gpu_memory_utilization` when "
-            "initializing the engine."
+            "initializing the engine. "
            "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
            "for more details."
        )
    max_model_len = vllm_config.model_config.max_model_len
@ -711,8 +713,10 @@ def check_enough_kv_cache_memory(
            f"cache is needed, which is larger than the available KV cache "
            f"memory ({available_memory / GiB_bytes:.2f} GiB). "
            f"{estimated_msg} "
-            f"Try increasing `gpu_memory_utilization` or decreasing "
+            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
-            f"`max_model_len` when initializing the engine."
+            f"when initializing the engine. "
            f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
            f"for more details."
        )
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@ -106,6 +106,7 @@ class Scheduler(SchedulerInterface):
        # KV Connector pushes/pull of remote KVs for P/D and offloading.
        self.connector = None
        self.connector_prefix_cache_stats: PrefixCacheStats | None = None
        self.recompute_kv_load_failures = True
        if self.vllm_config.kv_transfer_config is not None:
            assert not self.is_encoder_decoder, (
                "Encoder-decoder models are not currently supported with KV connectors"
@ -117,6 +118,10 @@ class Scheduler(SchedulerInterface):
            )
            if self.log_stats:
                self.connector_prefix_cache_stats = PrefixCacheStats()
            kv_load_failure_policy = (
                self.vllm_config.kv_transfer_config.kv_load_failure_policy
            )
            self.recompute_kv_load_failures = kv_load_failure_policy == "recompute"
        self.kv_event_publisher = EventPublisherFactory.create(
            self.kv_events_config,
@ -1066,7 +1071,7 @@ class Scheduler(SchedulerInterface):
        for req_id, num_tokens_scheduled in num_scheduled_tokens.items():
            assert num_tokens_scheduled > 0
            if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids:
-                # Skip requests that were recovered from KV load failure
+                # skip failed or rescheduled requests from KV load failure
                continue
            request = self.requests.get(req_id)
            if request is None:
@ -1177,6 +1182,21 @@ class Scheduler(SchedulerInterface):
            # This is a rare case and unlikely to impact performance.
            self.waiting.remove_requests(stopped_preempted_reqs)
        if failed_kv_load_req_ids and not self.recompute_kv_load_failures:
            requests = [self.requests[req_id] for req_id in failed_kv_load_req_ids]
            self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR)
            for request in requests:
                outputs[request.client_index].append(
                    EngineCoreOutput(
                        request_id=request.request_id,
                        new_token_ids=[],
                        finish_reason=request.get_finished_reason(),
                        events=request.take_events(),
                        trace_headers=request.trace_headers,
                        num_cached_tokens=request.num_cached_tokens,
                    )
                )
        # KV Connector: update state for finished KV Transfers.
        if kv_connector_output:
            self._update_from_kv_xfer_finished(kv_connector_output)
@ -1610,8 +1630,11 @@ class Scheduler(SchedulerInterface):
            self._free_blocks(self.requests[req_id])
    def _update_requests_with_invalid_blocks(
-        self, requests: Iterable[Request], invalid_block_ids: set[int]
+        self,
-    ) -> tuple[set[str], int]:
+        requests: Iterable[Request],
        invalid_block_ids: set[int],
        evict_blocks: bool = True,
    ) -> tuple[set[str], int, set[int]]:
        """
        Identify and update requests affected by invalid KV cache blocks.
@ -1623,16 +1646,21 @@ class Scheduler(SchedulerInterface):
        Args:
            requests: The set of requests to scan for invalid blocks.
            invalid_block_ids: IDs of invalid blocks.
            evict_blocks: Whether to collect blocks for eviction (False for
                async requests which aren't cached yet).
        Returns:
            tuple:
                - affected_req_ids (set[str]): IDs of requests impacted by
                invalid blocks.
                - total_affected_tokens (int): Total number of tokens that must
-                be recomputed across all affected requests (for observability).
+                be recomputed across all affected requests.
                - blocks_to_evict (set[int]): Block IDs to evict from cache,
                including invalid blocks and downstream dependent blocks.
        """
        affected_req_ids: set[str] = set()
        total_affected_tokens = 0
        blocks_to_evict: set[int] = set()
        # If a block is invalid and shared by multiple requests in the batch,
        # these requests must be rescheduled, but only the first will recompute
        # it. This set tracks blocks already marked for recomputation.
@ -1690,6 +1718,9 @@ class Scheduler(SchedulerInterface):
                )
                total_affected_tokens += num_affected_tokens
                request.num_external_computed_tokens -= num_affected_tokens
                # collect invalid block and all downstream dependent blocks
                if evict_blocks:
                    blocks_to_evict.update(req_block_ids[idx:])
            if is_affected:
                if not marked_invalid_block:
@ -1705,47 +1736,70 @@ class Scheduler(SchedulerInterface):
                affected_req_ids.add(request.request_id)
-        return affected_req_ids, total_affected_tokens
+        return affected_req_ids, total_affected_tokens, blocks_to_evict
    def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]:
-        total_requests_to_reschedule = 0
+        """
-        total_tokens_to_reschedule = 0
+        Handle requests affected by invalid KV cache blocks.
-        # --- Handle async KV loads (WAITING_FOR_REMOTE_KVS) ---
+        Returns:
            Set of affected request IDs to skip in update_from_output main loop.
        """
        should_fail = not self.recompute_kv_load_failures
        # handle async KV loads (not cached yet, evict_blocks=False)
        async_load_reqs = (
            req
            for req in self.waiting
            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS
        )
-        async_affected_req_ids, num_tokens_to_reschedule = (
+        async_failed_req_ids, num_failed_tokens, _ = (
            self._update_requests_with_invalid_blocks(
-                async_load_reqs, invalid_block_ids
+                async_load_reqs, invalid_block_ids, evict_blocks=False
            )
        )
-        total_requests_to_reschedule += len(async_affected_req_ids)
+        total_failed_requests = len(async_failed_req_ids)
-        total_tokens_to_reschedule += num_tokens_to_reschedule
+        total_failed_tokens = num_failed_tokens
-        # Mark requests with async KV load failures; they will be rescheduled
+        # handle sync loads (may be cached, collect blocks for eviction)
-        # once loading completes.
+        sync_failed_req_ids, num_failed_tokens, sync_blocks_to_evict = (
-        self.failed_recving_kv_req_ids |= async_affected_req_ids
+            self._update_requests_with_invalid_blocks(
-
+                self.running, invalid_block_ids, evict_blocks=True
-        # --- Handle sync KV loads (running requests) ---
+            )
        sync_affected_req_ids, num_tokens_to_reschedule = (
            self._update_requests_with_invalid_blocks(self.running, invalid_block_ids)
        )
-        total_requests_to_reschedule += len(sync_affected_req_ids)
+        total_failed_requests += len(sync_failed_req_ids)
-        total_tokens_to_reschedule += num_tokens_to_reschedule
+        total_failed_tokens += num_failed_tokens
-        if total_requests_to_reschedule:
+        if not total_failed_requests:
-            logger.warning(
+            return set()
-                "Recovered from KV load failure: "
+
-                "%d request(s) rescheduled (%d tokens affected).",
+        # evict invalid blocks and downstream dependent blocks from cache
-                total_requests_to_reschedule,
+        # only when not using recompute policy (where blocks will be recomputed
-                total_tokens_to_reschedule,
+        # and reused by other requests sharing them)
        if sync_blocks_to_evict and not self.recompute_kv_load_failures:
            self.kv_cache_manager.evict_blocks(sync_blocks_to_evict)
        if should_fail:
            all_failed_req_ids = async_failed_req_ids | sync_failed_req_ids
            logger.error(
                "Failing %d request(s) due to KV load failure "
                "(failure_policy=fail, %d tokens affected). Request IDs: %s",
                total_failed_requests,
                total_failed_tokens,
                all_failed_req_ids,
            )
            return all_failed_req_ids
-        # Return the IDs of affected running requests to skip in
+        logger.warning(
-        # update_from_output.
+            "Recovered from KV load failure: "
-        return sync_affected_req_ids
+            "%d request(s) rescheduled (%d tokens affected).",
            total_failed_requests,
            total_failed_tokens,
        )
        # Mark async requests with KV load failures for retry once loading completes
        self.failed_recving_kv_req_ids |= async_failed_req_ids
        # Return sync affected IDs to skip in update_from_output
        return sync_failed_req_ids
--- a/vllm/v1/engine/init.py
+++ b/vllm/v1/engine/init.py
@ -19,24 +19,27 @@ from vllm.v1.serial_utils import UtilityResult
 # These are possible values of RequestOutput.finish_reason,
 # so form part of the external API.
-FINISH_REASON_STRINGS = ("stop", "length", "abort")
+FINISH_REASON_STRINGS = ("stop", "length", "abort", "error")
 class FinishReason(enum.IntEnum):
    """
-    Reason a request finished - stop, length, or abort.
+    Reason a request finished - stop, length, abort, or error.
    Int rather than Str for more compact serialization.
    stop - a stop string was emitted
    length - max_tokens was consumed, or max_model_len was reached
-    abort - aborted for another reason
+    abort - aborted by client
    error - retryable request-level internal error (e.g., KV load failure).
            Invariant: always converted to 500 Internal Server Error.
    """
    STOP = 0
    LENGTH = 1
    ABORT = 2
    ERROR = 3
    def __str__(self):
        return FINISH_REASON_STRINGS[self.value]
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@ -192,7 +192,7 @@ class AsyncLLM(EngineClient):
    @property
    @deprecated(
        "`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. "
-        "The old name will be removed in v0.13."
+        "The old name will be removed in v0.14."
    )
    def processor(self):
        return self.input_processor
@ -701,10 +701,6 @@ class AsyncLLM(EngineClient):
    def tokenizer(self) -> TokenizerLike | None:
        return self.input_processor.tokenizer
    @tokenizer.setter
    def tokenizer(self, tokenizer: TokenizerLike | None) -> None:
        self.input_processor.tokenizer = tokenizer
    async def get_tokenizer(self) -> TokenizerLike:
        if self.tokenizer is None:
            raise ValueError(
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@ -211,6 +211,9 @@ class EngineCore:
        freeze_gc_heap()
        # If enable, attach GC debugger after static variable freeze.
        maybe_attach_gc_debug_callback()
        # Enable environment variable cache (e.g. assume no more
        # environment variable overrides after this point)
        enable_envs_cache()
    def _initialize_kv_caches(
        self, vllm_config: VllmConfig
@ -672,10 +675,6 @@ class EngineCoreProc(EngineCore):
                assert addresses.coordinator_input is not None
                logger.info("Waiting for READY message from DP Coordinator...")
        # Enable environment variable cache (e.g. assume no more
        # environment variable overrides after this point)
        enable_envs_cache()
    @contextmanager
    def _perform_handshakes(
        self,
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@ -64,10 +64,6 @@ class InputProcessor:
    def tokenizer(self) -> TokenizerLike | None:
        return self.input_preprocessor.tokenizer
    @tokenizer.setter
    def tokenizer(self, tokenizer: TokenizerLike | None) -> None:
        self.input_preprocessor.tokenizer = tokenizer
    def _validate_logprobs(
        self,
        params: SamplingParams,
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@ -139,7 +139,7 @@ class LLMEngine:
    @property
    @deprecated(
        "`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
-        "The old name will be removed in v0.13."
+        "The old name will be removed in v0.14."
    )
    def processor(self):
        return self.input_processor
@ -358,10 +358,6 @@ class LLMEngine:
    def tokenizer(self) -> TokenizerLike | None:
        return self.input_processor.tokenizer
    @tokenizer.setter
    def tokenizer(self, tokenizer: TokenizerLike | None) -> None:
        self.input_processor.tokenizer = tokenizer
    def get_tokenizer(self) -> TokenizerLike:
        if self.tokenizer is None:
            raise ValueError(
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@ -10,7 +10,7 @@ def __getattr__(name: str):
        warnings.warn(
            "`vllm.v1.engine.processor.Processor` has been moved to "
            "`vllm.v1.engine.input_processor.InputProcessor`. "
-            "The old name will be removed in v0.13.",
+            "The old name will be removed in v0.14.",
            DeprecationWarning,
            stacklevel=2,
        )
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@ -255,6 +255,7 @@ class RequestStatus(enum.IntEnum):
    FINISHED_LENGTH_CAPPED = enum.auto()
    FINISHED_ABORTED = enum.auto()
    FINISHED_IGNORED = enum.auto()
    FINISHED_ERROR = enum.auto()
    def __str__(self):
        return self.name
@ -277,4 +278,5 @@ _FINISHED_REASON_MAP = {
    RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH,
    RequestStatus.FINISHED_ABORTED: FinishReason.ABORT,
    RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH,
    RequestStatus.FINISHED_ERROR: FinishReason.ERROR,
 }
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@ -1275,6 +1275,8 @@ class GPUModelRunner(
        if not isinstance(kv_cache_spec, CrossAttentionSpec):
            return None, None
        # Zero out buffer for padding requests that are not actually scheduled (CGs)
        self.encoder_seq_lens.np[:num_reqs] = 0
        # Build encoder_seq_lens array mapping request indices to
        # encoder lengths for inputs scheduled in this batch
        for req_id in num_scheduled_tokens:
@ -2874,6 +2876,7 @@ class GPUModelRunner(
        # be improved in model runner v2)
        force_uniform_decode: bool | None = None,
        force_has_lora: bool | None = None,
        num_encoder_reqs: int = 0,
    ) -> tuple[
        CUDAGraphMode,
        BatchDescriptor,
@ -2890,6 +2893,11 @@ class GPUModelRunner(
            if force_uniform_decode is None
            else force_uniform_decode
        )
        # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
        # is present). Also, chunked-prefill is disabled, so batch are uniform.
        has_encoder_output = (
            self.model_config.is_encoder_decoder and num_encoder_reqs > 0
        )
        has_lora = (
            len(self.input_batch.lora_id_to_lora_request) > 0
@ -2909,7 +2917,7 @@ class GPUModelRunner(
        )
        cudagraph_mode, batch_descriptor = dispatch_cudagraph(
-            num_tokens_padded, use_cascade_attn
+            num_tokens_padded, use_cascade_attn or has_encoder_output
        )
        num_tokens_padded = batch_descriptor.num_tokens
@ -3107,6 +3115,7 @@ class GPUModelRunner(
                    num_scheduled_tokens_np=num_scheduled_tokens_np,
                    max_num_scheduled_tokens=max_num_scheduled_tokens,
                    use_cascade_attn=cascade_attn_prefix_lens is not None,
                    num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
                )
                logger.debug(
@ -3672,74 +3681,89 @@ class GPUModelRunner(
        if self.parallel_config.enable_eplb:
            self.eplb_state = EplbState(self.parallel_config, self.device)
            eplb_models = 0
-        with DeviceMemoryProfiler() as m:
+
-            time_before_load = time.perf_counter()
+        try:
-            model_loader = get_model_loader(self.load_config)
+            with DeviceMemoryProfiler() as m:
-            self.model = model_loader.load_model(
+                time_before_load = time.perf_counter()
-                vllm_config=self.vllm_config, model_config=self.model_config
+                model_loader = get_model_loader(self.load_config)
-            )
+                self.model = model_loader.load_model(
-            if self.lora_config:
+                    vllm_config=self.vllm_config, model_config=self.model_config
                self.model = self.load_lora_model(
                    self.model, self.vllm_config, self.device
                )
-            if hasattr(self, "drafter"):
+                if self.lora_config:
-                logger.info_once("Loading drafter model...")
+                    self.model = self.load_lora_model(
-                self.drafter.load_model(self.model)
+                        self.model, self.vllm_config, self.device
                if (
                    hasattr(self.drafter, "model")
                    and is_mixture_of_experts(self.drafter.model)
                    and self.parallel_config.enable_eplb
                ):
                    spec_config = self.vllm_config.speculative_config
                    assert spec_config is not None
                    assert spec_config.draft_model_config is not None
                    logger.info_once(
                        "EPLB is enabled for drafter model %s.",
                        spec_config.draft_model_config.model,
                    )
                if hasattr(self, "drafter"):
                    logger.info_once("Loading drafter model...")
                    self.drafter.load_model(self.model)
                    if (
                        hasattr(self.drafter, "model")
                        and is_mixture_of_experts(self.drafter.model)
                        and self.parallel_config.enable_eplb
                    ):
                        spec_config = self.vllm_config.speculative_config
                        assert spec_config is not None
                        assert spec_config.draft_model_config is not None
                        logger.info_once(
                            "EPLB is enabled for drafter model %s.",
                            spec_config.draft_model_config.model,
                        )
-                    global_expert_load = (
+                        global_expert_load = (
-                        global_expert_loads[eplb_models]
+                            global_expert_loads[eplb_models]
-                        if global_expert_loads
+                            if global_expert_loads
-                        else None
+                            else None
-                    )
+                        )
-                    old_global_expert_indices = (
+                        old_global_expert_indices = (
-                        old_global_expert_indices_per_model[eplb_models]
+                            old_global_expert_indices_per_model[eplb_models]
-                        if old_global_expert_indices_per_model
+                            if old_global_expert_indices_per_model
-                        else None
+                            else None
-                    )
+                        )
-                    if self.eplb_state is None:
+                        if self.eplb_state is None:
-                        self.eplb_state = EplbState(self.parallel_config, self.device)
+                            self.eplb_state = EplbState(
-                    self.eplb_state.add_model(
+                                self.parallel_config, self.device
-                        self.drafter.model,
+                            )
-                        spec_config.draft_model_config,
+                        self.eplb_state.add_model(
-                        global_expert_load,
+                            self.drafter.model,
-                        old_global_expert_indices,
+                            spec_config.draft_model_config,
-                        rank_mapping,
+                            global_expert_load,
-                    )
+                            old_global_expert_indices,
-                    eplb_models += 1
+                            rank_mapping,
                        )
                        eplb_models += 1
-            if self.use_aux_hidden_state_outputs:
+                if self.use_aux_hidden_state_outputs:
-                if not supports_eagle3(self.get_model()):
+                    if not supports_eagle3(self.get_model()):
-                    raise RuntimeError(
+                        raise RuntimeError(
-                        "Model does not support EAGLE3 interface but "
+                            "Model does not support EAGLE3 interface but "
-                        "aux_hidden_state_outputs was requested"
+                            "aux_hidden_state_outputs was requested"
-                    )
+                        )
-                # Try to get auxiliary layers from speculative config,
+                    # Try to get auxiliary layers from speculative config,
-                # otherwise use model's default layers
+                    # otherwise use model's default layers
-                aux_layers = self._get_eagle3_aux_layers_from_config()
+                    aux_layers = self._get_eagle3_aux_layers_from_config()
-                if aux_layers:
+                    if aux_layers:
-                    logger.info(
+                        logger.info(
-                        "Using auxiliary layers from speculative config: %s",
+                            "Using auxiliary layers from speculative config: %s",
-                        aux_layers,
+                            aux_layers,
-                    )
+                        )
-                else:
+                    else:
-                    aux_layers = self.model.get_eagle3_aux_hidden_state_layers()
+                        aux_layers = self.model.get_eagle3_aux_hidden_state_layers()
-                self.model.set_aux_hidden_state_layers(aux_layers)
+                    self.model.set_aux_hidden_state_layers(aux_layers)
-            time_after_load = time.perf_counter()
+                time_after_load = time.perf_counter()
-        self.model_memory_usage = m.consumed_memory
+            self.model_memory_usage = m.consumed_memory
        except torch.cuda.OutOfMemoryError as e:
            msg = (
                "Failed to load model - not enough GPU memory. "
                "Try lowering --gpu-memory-utilization to free memory for weights, "
                "increasing --tensor-parallel-size, or using --quantization. "
                "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
                "for more tips."
            )
            combined_msg = f"{msg} (original error: {e})"
            logger.error(combined_msg)
            raise e
        logger.info_once(
            "Model loading took %.4f GiB memory and %.6f seconds",
            self.model_memory_usage / GiB_bytes,
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@ -10,7 +10,7 @@ import torch
 import torch.nn as nn
 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.distributed import (
    ensure_model_parallel_initialized,
    init_distributed_environment,
@ -207,7 +207,8 @@ class TPUWorker:
        # one compiled bytecode. Having one FX graph/cached bytecode per
        # compiled model is required for `support_torch_compile` decorator to
        # skip dynamo guard.
-        self.model_runner.reset_dynamo_cache()
+        with set_current_vllm_config(self.vllm_config):
            self.model_runner.reset_dynamo_cache()
        # Get the maximum amount of memory used by the model weights and
        # intermediate activations.
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@ -313,8 +313,12 @@ def bind_kv_cache(
            # TODO - analyze where runner_kv_caches is used and the right
            # way to ensure it properly reflects multiple attention layers
            # in the same decoder block.
-            if current_platform.is_cuda_alike() or current_platform.is_xpu():
+            if (
-                # We know that the GPU runner is not impacted by this
+                current_platform.is_cuda_alike()
                or current_platform.is_xpu()
                or current_platform.is_cpu()
            ):
                # We know that the GPU / CPU runner is not impacted by this
                # case. Some test code depends on runner_kv_caches, but
                # not in a way that's impacted by ignoring this.
                pass
`@ -1,2 +1,2 @@`
	`lmcache`	`lmcache >= 0.3.10.post1`
	`nixl >= 0.7.1 # Required for disaggregated prefill`	`nixl >= 0.7.1 # Required for disaggregated prefill`