Merge branch 'main' into model_arch_cfg

This commit is contained in:
Xingyu Liu 2025-12-10 10:16:26 -08:00 committed by GitHub
commit 72730a74d5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
58 changed files with 867 additions and 234 deletions

View File

@ -36,6 +36,11 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run model tests
docker exec cpu-test bash -c "
set -e
pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model"
# Run kernel tests
docker exec cpu-test bash -c "
set -e

View File

@ -574,7 +574,7 @@ async def benchmark(
)
print(
"{:<40} {:<10.2f}".format(
"Total Token throughput (tok/s):", metrics.total_token_throughput
"Total token throughput (tok/s):", metrics.total_token_throughput
)
)

View File

@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
endif()
# Build ACL with CMake
set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
set(CMAKE_BUILD_TYPE "Release")
set(ARM_COMPUTE_ARCH "armv8.2-a")
set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
set(ARM_COMPUTE_ENABLE_OPENMP "ON")
set(ARM_COMPUTE_ENABLE_WERROR "OFF")
set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
set(ARM_COMPUTE_BUILD_TESTING "OFF")
set(_cmake_config_cmd
${CMAKE_COMMAND} -G Ninja -B build
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF

View File

@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata(
input.casual = casual;
input.isa = isa;
input.enable_kv_split = enable_kv_split;
TORCH_CHECK(casual, "Only supports casual mask for now.");
VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() {
CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] {

View File

@ -186,7 +186,7 @@ struct AttentionMetadata {
// - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
// * q_tile_size * 4, partial output, max + sum (float)
// Reduction scratchpad contains:
// - flags: bool array to indicate wether the split is finished
// - flags: bool array to indicate whether the split is finished
// - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
// - max, sum: 2 * split_num * q_tile_size * 4
class AttentionScratchPad {

View File

@ -617,7 +617,7 @@ struct MacheteCollectiveMma {
// Same as upstream, should be kept the same when possible, not formatted for
// easier comparison
// with `SwapAB ? N : M -> M` since we dont support SwapAB
// with `SwapAB ? N : M -> M` since we don't support SwapAB
// clang-format off
template<class ProblemShape>
static bool

View File

@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx,
}
#endif // defined(__HIP__GFX9__) TODO: Add NAVI support
// Find the min val of div2 that doesn't increase N/(div1*div2)
int mindiv(int N, int div1, int div2) {
int nPrRnd = div1 * div2;
int rnds0 = N / nPrRnd;
nPrRnd -= div1 * 3;
int rnds3 = N / nPrRnd;
nPrRnd -= div1;
int rnds4 = N / nPrRnd;
nPrRnd -= div1;
int rnds5 = N / nPrRnd;
nPrRnd -= div1;
int rnds6 = N / nPrRnd;
nPrRnd -= div1;
int rnds7 = N / nPrRnd;
nPrRnd -= div1;
int rnds8 = N / nPrRnd;
nPrRnd -= div1;
int rnds9 = N / nPrRnd;
nPrRnd -= div1;
int rtn = div2;
if (rnds0 == rnds3) rtn = div2 - 3;
if (rnds0 == rnds4) rtn = div2 - 4;
if (rnds0 == rnds5) rtn = div2 - 5;
if (rnds0 == rnds6) rtn = div2 - 6;
if (rnds0 == rnds7) rtn = div2 - 7;
if (rnds0 == rnds8) rtn = div2 - 8;
if (rnds0 == rnds9) rtn = div2 - 9;
return rtn;
int rnds[13];
for (int i = 0; i < 13; i++) {
rnds[i] = (N + nPrRnd - 1) / nPrRnd;
nPrRnd -= div1;
}
for (int i = 12; i >= 0; i--)
if (rnds[0] == rnds[i]) return (div2 - i);
}
torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int max_lds_len = get_lds_size() / 2;
#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \
_N) \
{ \
dim3 block(64, _WvPrGrp); \
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \
wvSplitK_hf_sml_<fptype, 64, _YTILEs, _WvPrGrp, 8, _UNRLs, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} else if (K_in * N_in <= max_lds_len * 1.2) { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \
wvSplitK_hf_<fptype, 64, _YTILEm, _WvPrGrp, 8, _UNRLm, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} else { \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \
wvSplitK_hf_big_<fptype, 64, _YTILEb, _WvPrGrp, 8, _UNRLb, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
} \
#define WVSPLITK(_YTILE, _UNRL, _N) \
{ \
dim3 block(64, 16); \
int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \
if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \
wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
else if (K_in * N_in <= max_lds_len * 1.2) \
wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
else \
wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N> \
<<<grid, block, 0, stream>>>(K_in, M_in, Bx_in, By_in, af4, bf4, \
biasf4, c, __wvPrGrp, CuCount); \
}
#define WVSPLIT_TILE(_sYT, __N) \
{ \
bool fit_lds = (K_in * N_in <= max_lds_len); \
if (_sYT <= 1) \
WVSPLITK(1, 4, __N) \
else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
WVSPLITK(2, 2, __N) \
else if (_sYT <= 4 * 3) \
WVSPLITK(3, 2, __N) \
else if (__N == 4) \
WVSPLITK(4, 1, __N) \
else \
WVSPLITK(4, 2, __N) \
}
AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
? reinterpret_cast<const fptype*>(in_bias->data_ptr())
: nullptr;
fptype* c = reinterpret_cast<fptype*>(out_c.data_ptr());
// first shoot for biggest tile-size that keeps all simd busy,
// then cut the active waves to balance their distribution...
int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
switch (N_in) {
case 1:
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1)
WVSPLIT_TILE(sYT, 1)
break;
case 2:
WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2)
WVSPLIT_TILE(sYT, 2)
break;
case 3:
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3)
WVSPLIT_TILE(sYT, 3)
break;
case 4:
WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4)
WVSPLIT_TILE(sYT, 4)
break;
default:
throw std::runtime_error(

View File

@ -84,7 +84,7 @@ Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
Total token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88

View File

@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request
### v1 Metrics
In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix:
In v1, an extensive set of metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example:
- `vllm:num_requests_running` (Gauge) - Number of requests currently running.
- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting.
- `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0-1).
- `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries.
- `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits.
- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries.
- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits.
- `vllm:num_preemptions_total` (Counter) - Number of preemptions.
- `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed.
- `vllm:generation_tokens_total` (Counter) - Total number of generated tokens.
- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step.
- `vllm:cache_config_info` (Gauge) - Information about the cache configuration.
- `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason).
- `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts.
- `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts.
- `vllm:request_params_n` (Histogram) - Histogram of request parameter n.
- `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests.
- `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT).
- `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency.
- `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency.
- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue.
- `vllm:request_inference_time_seconds` (Histogram) - Request inference time.
- `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time.
- `vllm:request_decode_time_seconds` (Histogram) - Request decode time.

View File

@ -22,7 +22,7 @@ python tools/install_nixl_from_source_ubuntu.py
NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:
```bash
# Example UCX configuration, adjust according to your enviroment
# Example UCX configuration, adjust according to your environment
export UCX_TLS=all # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1"
```

View File

@ -0,0 +1,149 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import ast
import logging
from pathlib import Path
from typing import Literal
# MkDocs hook logger; messages show up in the `mkdocs build`/`serve` output.
logger = logging.getLogger("mkdocs")
# Repo root is four levels above this hook file
# (docs/mkdocs/hooks/<this file> -> repo root).
ROOT_DIR = Path(__file__).parent.parent.parent.parent
DOCS_DIR = ROOT_DIR / "docs"
# Output directory for the generated markdown tables.
GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics"
# Files to scan for metric definitions - each will generate a separate table
METRIC_SOURCE_FILES = [
    {"path": "vllm/v1/metrics/loggers.py", "output": "general.md"},
    {
        "path": "vllm/v1/spec_decode/metrics.py",
        "output": "spec_decode.md",
    },
    {
        "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
        "output": "nixl_connector.md",
    },
]
class MetricExtractor(ast.NodeVisitor):
"""AST visitor to extract metric definitions."""
def __init__(self):
self.metrics: list[dict[str, str]] = []
def visit_Call(self, node: ast.Call) -> None:
"""Visit function calls to find metric class instantiations."""
metric_type = self._get_metric_type(node)
if metric_type:
name = self._extract_kwarg(node, "name")
documentation = self._extract_kwarg(node, "documentation")
if name:
self.metrics.append(
{
"name": name,
"type": metric_type,
"documentation": documentation or "",
}
)
self.generic_visit(node)
def _get_metric_type(self, node: ast.Call) -> str | None:
"""Determine if this call creates a metric and return its type."""
metric_type_map = {
"_gauge_cls": "gauge",
"_counter_cls": "counter",
"_histogram_cls": "histogram",
}
if isinstance(node.func, ast.Attribute):
return metric_type_map.get(node.func.attr)
return None
def _extract_kwarg(self, node: ast.Call, key: str) -> str | None:
"""Extract a keyword argument value from a function call."""
for keyword in node.keywords:
if keyword.arg == key:
return self._get_string_value(keyword.value)
return None
def _get_string_value(self, node: ast.AST) -> str | None:
"""Extract string value from an AST node."""
if isinstance(node, ast.Constant):
return str(node.value) if node.value is not None else None
return None
def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]:
    """Parse a Python file and extract all metric definitions.

    Raises:
        RuntimeError: if the file cannot be read or parsed.
    """
    try:
        # read_text mirrors open(...).read() with explicit UTF-8 decoding.
        tree = ast.parse(
            filepath.read_text(encoding="utf-8"), filename=str(filepath)
        )
        visitor = MetricExtractor()
        visitor.visit(tree)
        return visitor.metrics
    except Exception as e:
        raise RuntimeError(f"Failed to parse {filepath}: {e}") from e
def generate_markdown_table(metrics: list[dict[str, str]]) -> str:
    """Generate a markdown table from extracted metrics.

    Args:
        metrics: Entries with "name", "type" and "documentation" keys.

    Returns:
        A markdown table (with trailing newline), or a placeholder
        message when no metrics were found.
    """
    if not metrics:
        return "No metrics found.\n"
    # Sort by type, then by name
    metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"]))
    lines = []
    lines.append("| Metric Name | Type | Description |")
    lines.append("|-------------|------|-------------|")
    for metric in metrics_sorted:
        name = metric["name"]
        metric_type = metric["type"].capitalize()
        # Collapse newlines and escape pipe characters so a description
        # cannot break the markdown table layout.
        doc = (
            metric["documentation"].replace("\n", " ").replace("|", "\\|").strip()
        )
        lines.append(f"| `{name}` | {metric_type} | {doc} |")
    return "\n".join(lines) + "\n"
def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
    """Generate metrics documentation tables from source files.

    MkDocs startup hook: for every entry in METRIC_SOURCE_FILES, extract
    the metric definitions and write a markdown table into
    GENERATED_METRICS_DIR.

    Raises:
        FileNotFoundError: if a configured source file does not exist.
    """
    logger.info("Generating metrics documentation")
    # Make sure the output directory exists before writing any tables.
    GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True)
    total_metrics = 0
    for entry in METRIC_SOURCE_FILES:
        src_rel, out_name = entry["path"], entry["output"]
        src_path = ROOT_DIR / src_rel
        if not src_path.exists():
            raise FileNotFoundError(f"Metrics source file not found: {src_path}")
        logger.debug("Extracting metrics from: %s", src_rel)
        found = extract_metrics_from_file(src_path)
        logger.debug("Found %d metrics in %s", len(found), src_rel)
        # Generate and write the markdown table for this source.
        out_path = GENERATED_METRICS_DIR / out_name
        out_path.write_text(generate_markdown_table(found), encoding="utf-8")
        total_metrics += len(found)
        logger.info(
            "Generated metrics table: %s (%d metrics)",
            out_path.relative_to(ROOT_DIR),
            len(found),
        )
    logger.info(
        "Total metrics generated: %d across %d files",
        total_metrics,
        len(METRIC_SOURCE_FILES),
    )

View File

@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server:
The following metrics are exposed:
??? code
## General Metrics
```python
--8<-- "vllm/engine/metrics.py:metrics-definitions"
```
--8<-- "docs/generated/metrics/general.md"
## Speculative Decoding Metrics
--8<-- "docs/generated/metrics/spec_decode.md"
## NIXL KV Connector Metrics
--8<-- "docs/generated/metrics/nixl_connector.md"
## Deprecation Policy
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,

View File

@ -51,6 +51,7 @@ hooks:
- docs/mkdocs/hooks/remove_announcement.py
- docs/mkdocs/hooks/generate_examples.py
- docs/mkdocs/hooks/generate_argparse.py
- docs/mkdocs/hooks/generate_metrics.py
- docs/mkdocs/hooks/url_schemes.py
plugins:

View File

@ -75,7 +75,7 @@ torchgeo==0.7.0
mteb==2.1.2
# Data processing
xgrammar==0.1.27
xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
# Test async scheduling
# Utilities

View File

@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
from openai.types.responses.response_function_tool_call_output_item import (
ResponseFunctionToolCallOutputItem,
)
@ -14,7 +15,8 @@ from openai.types.responses.response_reasoning_item import (
)
from vllm.entrypoints.responses_utils import (
construct_chat_message_with_tool_call,
_construct_single_message_from_response_item,
construct_chat_messages_with_tool_call,
convert_tool_responses_to_completions_format,
)
@ -42,7 +44,43 @@ class TestResponsesUtils:
assert result == {"type": "function", "function": input_tool}
def test_construct_chat_message_with_tool_call(self):
def test_construct_chat_messages_with_tool_call(self):
"""Test construction of chat messages with tool calls."""
reasoning_item = ResponseReasoningItem(
id="lol",
summary=[],
type="reasoning",
content=[
Content(
text="Leroy Jenkins",
type="reasoning_text",
)
],
encrypted_content=None,
status=None,
)
mcp_tool_item = ResponseFunctionToolCall(
id="mcp_123",
call_id="call_123",
type="function_call",
status="completed",
name="python",
arguments='{"code": "123+456"}',
)
input_items = [reasoning_item, mcp_tool_item]
messages = construct_chat_messages_with_tool_call(input_items)
assert len(messages) == 1
message = messages[0]
assert message["role"] == "assistant"
assert message["reasoning"] == "Leroy Jenkins"
assert message["tool_calls"][0]["id"] == "call_123"
assert message["tool_calls"][0]["function"]["name"] == "python"
assert (
message["tool_calls"][0]["function"]["arguments"] == '{"code": "123+456"}'
)
def test_construct_single_message_from_response_item(self):
item = ResponseReasoningItem(
id="lol",
summary=[],
@ -56,7 +94,7 @@ class TestResponsesUtils:
encrypted_content=None,
status=None,
)
formatted_item = construct_chat_message_with_tool_call(item)
formatted_item = _construct_single_message_from_response_item(item)
assert formatted_item["role"] == "assistant"
assert formatted_item["reasoning"] == "Leroy Jenkins"
@ -74,7 +112,7 @@ class TestResponsesUtils:
status=None,
)
formatted_item = construct_chat_message_with_tool_call(item)
formatted_item = _construct_single_message_from_response_item(item)
assert formatted_item["role"] == "assistant"
assert (
formatted_item["reasoning"]
@ -88,7 +126,7 @@ class TestResponsesUtils:
output="1234",
status="completed",
)
formatted_item = construct_chat_message_with_tool_call(tool_call_output)
formatted_item = _construct_single_message_from_response_item(tool_call_output)
assert formatted_item["role"] == "tool"
assert formatted_item["content"] == "1234"
assert formatted_item["tool_call_id"] == "temp"
@ -102,7 +140,7 @@ class TestResponsesUtils:
status=None,
)
with pytest.raises(ValueError):
construct_chat_message_with_tool_call(item)
_construct_single_message_from_response_item(item)
output_item = ResponseOutputMessage(
id="msg_bf585bbbe3d500e0",
@ -119,6 +157,6 @@ class TestResponsesUtils:
type="message",
)
formatted_item = construct_chat_message_with_tool_call(output_item)
formatted_item = _construct_single_message_from_response_item(output_item)
assert formatted_item["role"] == "assistant"
assert formatted_item["content"] == "dongyi"

View File

@ -7,7 +7,8 @@ import math
import pytest
import torch
from vllm.platforms import current_platform
from vllm.platforms import CpuArchEnum, current_platform
from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
if not current_platform.is_cpu():
pytest.skip("skipping CPU-only tests", allow_module_level=True)
@ -36,6 +37,21 @@ SEQ_LENS = [ # (q_len, kv_len)
]
def get_attn_isa(
    block_size: int | None = None,
    dtype: torch.dtype | None = None,
):
    """Pick the attention ISA string for the current CPU.

    When both block_size and dtype are given, defers to the backend's own
    _get_attn_isa selection; otherwise falls back to a platform check:
    "neon" on Arm, "amx" when AMX tiles are supported, "vec" otherwise.
    """
    # NOTE(review): `block_size and dtype` treats block_size == 0 as unset;
    # presumably block sizes are always positive - confirm with callers.
    if block_size and dtype:
        return _get_attn_isa(dtype, block_size)
    else:
        if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
            return "neon"
        elif torch._C._cpu._is_amx_tile_supported():
            return "amx"
        else:
            return "vec"
# rand number generation takes too much time, cache rand tensors
@functools.lru_cache(maxsize=128, typed=False)
def tensor_cache(
@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16(
)
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", [96, 128])
@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
@pytest.mark.parametrize("dtype", QTYPES)
@pytest.mark.parametrize("soft_cap", [None])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [False])
@pytest.mark.parametrize("use_sink", [False])
@pytest.mark.parametrize("isa", ["neon"])
@pytest.mark.skipif(
    current_platform.get_cpu_architecture() != CpuArchEnum.ARM,
    reason="Not an Arm CPU.",
)
def test_varlen_with_paged_kv_normal_neon(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],
    head_size: int,
    sliding_window: int | None,
    dtype: torch.dtype,
    block_size: int,
    soft_cap: float | None,
    num_blocks: int,
    use_alibi: bool,
    use_sink: bool,
    isa: str,
) -> None:
    """Varlen paged-KV attention with the "neon" ISA (Arm CPUs only)."""
    # Delegates to the shared driver; only the isa/block_size combinations
    # differ from the other test_varlen_with_paged_kv_* variants.
    varlen_with_paged_kv(
        seq_lens=seq_lens,
        num_heads=num_heads,
        head_size=head_size,
        sliding_window=sliding_window,
        dtype=dtype,
        block_size=block_size,
        soft_cap=soft_cap,
        num_blocks=num_blocks,
        use_alibi=use_alibi,
        use_sink=use_sink,
        isa=isa,
    )
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", [96])
@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [False])
@pytest.mark.parametrize("use_sink", [False])
@pytest.mark.parametrize(
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
)
@pytest.mark.parametrize("isa", [get_attn_isa()])
def test_varlen_with_paged_kv_softcap(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [True])
@pytest.mark.parametrize("use_sink", [False])
@pytest.mark.parametrize(
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
)
@pytest.mark.parametrize("isa", [get_attn_isa()])
def test_varlen_with_paged_kv_alibi(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],
@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [False])
@pytest.mark.parametrize("use_sink", [True])
@pytest.mark.parametrize(
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
)
@pytest.mark.parametrize("isa", [get_attn_isa()])
def test_varlen_with_paged_kv_sink(
seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int],

View File

@ -54,6 +54,10 @@ def setup_cuda():
torch.set_default_device("cuda")
@pytest.mark.skipif(
current_platform.is_fp8_fnuz(),
reason="This platform supports e4m3fnuz, not e4m3fn.",
)
@pytest.mark.parametrize(
"num_tokens,d,dtype,group_size,seed",
itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS),
@ -78,14 +82,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
torch.manual_seed(seed)
factor_for_scale = 1e-2
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_info = torch.finfo(current_platform.fp8_dtype())
fp8_max, fp8_min = fp8_info.max, fp8_info.min
A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype())
block_n, block_k = block_size[0], block_size[1]
n_tiles = (N + block_n - 1) // block_n
@ -103,6 +107,9 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
assert rel_diff < 0.001
@pytest.mark.skipif(
not current_platform.is_cuda(), reason="CUTLASS only supported on CUDA platform."
)
@torch.inference_mode()
def test_w8a8_block_fp8_cutlass_matmul():
# Test simple case where weight.shape % 128 != 0,
@ -151,6 +158,10 @@ def test_w8a8_block_fp8_cutlass_matmul():
assert rel_diff < 0.001
@pytest.mark.skipif(
current_platform.is_fp8_fnuz(),
reason="This platform supports e4m3fnuz, not e4m3fn.",
)
@pytest.mark.parametrize(
"M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),

View File

@ -15,6 +15,9 @@ from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv
if not current_platform.is_cuda():
pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
MNK_FACTORS = [
(1, 256, 128),
(1, 16384, 1024),

View File

@ -21,6 +21,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
if not current_platform.is_cuda():
pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True)
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
# unit tests to a common utility function. Currently the use of
# `is_quant_method_supported` conflates kernels with quantization methods

View File

@ -92,16 +92,19 @@ def run_test(
*,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
dtype: str = "half",
) -> None:
prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10
with vllm_runner(
model,
dtype="half",
dtype=dtype,
max_model_len=448,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
# TODO (NickLucche) figure out output differences with non-eager and re-enable
enforce_eager=True,
) as vllm_model:
llm = vllm_model.llm
@ -120,12 +123,28 @@ def run_test(
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
def test_models(vllm_runner, model) -> None:
def test_models(vllm_runner, model, dtype) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=1,
dtype=dtype,
)
@pytest.mark.cpu_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("dtype", ["half"])
def test_models_cpu(vllm_runner, model, dtype) -> None:
    """Run the Whisper generation check on CPU via the shared run_test."""
    # @create_new_process_for_each_test() does not work for some runners
    # TODO: to fix cpu privilege issues in run-cpu-test-arm.sh
    run_test(
        vllm_runner,
        model,
        tensor_parallel_size=1,
        dtype=dtype,
    )

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import base64
import mimetypes
import os
@ -186,6 +187,7 @@ async def test_fetch_image_error_conversion():
connector.fetch_image(broken_img)
@pytest.mark.flaky(reruns=3, reruns_delay=5)
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
@ -198,8 +200,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
}
)
video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(video_url)
try:
video_sync, metadata_sync = connector.fetch_video(video_url)
video_async, metadata_async = await connector.fetch_video_async(video_url)
except (TimeoutError, asyncio.TimeoutError) as e:
pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")
assert np.array_equal(video_sync, video_async)
assert metadata_sync == metadata_async

View File

@ -106,8 +106,8 @@ def create_common_attn_metadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=batch_spec.batch_size,
num_actual_tokens=num_tokens,
max_query_len=max_query_len,

View File

@ -8,6 +8,7 @@ import torch._dynamo.config as dynamo_config
from vllm import SamplingParams
from vllm.logprobs import Logprob
from vllm.platforms import current_platform
from vllm.sampling_params import StructuredOutputsParams
from vllm.v1.metrics.reader import Metric
@ -70,6 +71,18 @@ def test_without_spec_decoding(
(True, "uni", True, None, True),
]
if current_platform.is_rocm():
# On ROCm, Only test with structured_outputs (deterministic)
# and skip chunk_prefill (more variable).
test_configs = [
cfg
for cfg in test_configs
if not cfg[4] # skip chunk_prefill=True
]
test_sampling_params = [
p for p in test_sampling_params if p.get("structured_outputs") is not None
]
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
@ -108,7 +121,14 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
(True, "uni", True, spec_config_short, True),
]
run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params)
# On ROCm, use TRITON_ATTN + float32 for better numerical consistency
run_tests(
monkeypatch,
MTP_MODEL,
test_configs,
test_sampling_params,
is_testing_with_spec_decoding=True,
)
@dynamo_config.patch(cache_size_limit=16)
@ -117,13 +137,21 @@ def run_tests(
model: str,
test_configs: list[tuple],
test_sampling_params: list[dict[str, Any]],
is_testing_with_spec_decoding: bool = False,
):
"""Test consistency of combos of async scheduling, preemption,
uni/multiproc executor with spec decoding."""
with monkeypatch.context() as m:
# avoid precision errors
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
if current_platform.is_rocm():
if is_testing_with_spec_decoding:
# Use TRITON_ATTN for spec decoding test for consistency
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
else:
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
else:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
# lock matmul precision to full FP32
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
# m.setenv("VLLM_BATCH_INVARIANT", "1")
@ -145,6 +173,7 @@ def run_tests(
async_scheduling,
spec_config,
test_prefill_chunking=test_prefill_chunking,
is_testing_with_spec_decoding=is_testing_with_spec_decoding,
)
outputs.append(test_results)
@ -174,17 +203,34 @@ def run_tests(
name_0=f"baseline=[{baseline_config}], params={params}",
name_1=f"config=[{test_config}], params={params}",
)
assert _all_logprobs_match(base_logprobs, test_logprobs)
# On ROCm with TRITON_ATTN (spec decoding test), skip strict
# logprobs comparison when logprobs are requested
skip_logprobs_check = (
current_platform.is_rocm()
and params.get("logprobs")
and is_testing_with_spec_decoding
)
if not skip_logprobs_check:
assert _all_logprobs_match(base_logprobs, test_logprobs)
if (
base_acceptance_rate is not None
and test_acceptance_rate is not None
):
if "spec_mml=None" in test_config:
# Preemption causes more variance in acceptance rates
if (
current_platform.is_rocm()
and "preemption=True" in test_config
):
tolerance = 0.10
else:
tolerance = 0.05
assert (
test_acceptance_rate > base_acceptance_rate
or test_acceptance_rate
== pytest.approx(base_acceptance_rate, rel=5e-2)
== pytest.approx(base_acceptance_rate, rel=tolerance)
)
else:
# Currently the reported acceptance rate is expected to be
@ -215,6 +261,7 @@ def run_test(
async_scheduling: bool,
spec_config: dict[str, Any] | None,
test_prefill_chunking: bool,
is_testing_with_spec_decoding: bool = False,
):
spec_decoding = spec_config is not None
cache_arg: dict[str, Any] = (
@ -233,6 +280,15 @@ def run_test(
print("-" * 80)
print(f"---- TESTING {test_str}: {test_config}")
print("-" * 80)
# On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for
# spec decoding test (TRITON_ATTN) for better precision.
# On others: always use float32.
if current_platform.is_rocm() and not is_testing_with_spec_decoding:
dtype = "float16"
else:
dtype = "float32"
with VllmRunner(
model,
max_model_len=512,
@ -242,7 +298,7 @@ def run_test(
# enforce_eager=True,
async_scheduling=async_scheduling,
distributed_executor_backend=executor,
dtype="float32", # avoid precision errors
dtype=dtype,
speculative_config=spec_config,
disable_log_stats=False,
**cache_arg,
@ -302,11 +358,21 @@ def _all_logprobs_match(req_a, req_b) -> bool:
def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
return len(lps_a) == len(lps_b) and all(
a.decoded_token == b.decoded_token
and a.rank == b.rank
and a.logprob == pytest.approx(b.logprob, rel=1e-3, abs=1e-6)
for a, b in ((lps_a[x], lps_b[x]) for x in lps_a)
if current_platform.is_rocm():
# ROCm has higher numerical variance
# due to use of float16.
rel_tol, abs_tol = 5e-2, 1e-5
else:
rel_tol, abs_tol = 1e-3, 1e-6
return (
len(lps_a) == len(lps_b)
and lps_a.keys() == lps_b.keys()
and all(
a.decoded_token == b.decoded_token
and a.rank == b.rank
and a.logprob == pytest.approx(b.logprob, rel=rel_tol, abs=abs_tol)
for a, b in ((lps_a[x], lps_b[x]) for x in lps_a)
)
)

View File

@ -0,0 +1,131 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test that verifies no implicit GPU-CPU synchronization occurs during
speculative decoding generation under expected conditions.
"""
import multiprocessing
import sys
import traceback
import pytest
import torch
@pytest.fixture
def sync_tracker():
    """
    Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
    lazy init syncs. Prints stack traces immediately when syncs occur.

    Yields a SyncTracker handle exposing the running sync count and an
    ``assert_no_sync()`` helper; the original property is restored on
    teardown.
    """
    from vllm.v1.attention.backends.utils import CommonAttentionMetadata

    # Shared counter for cross-process communication (inherited by fork)
    sync_count = multiprocessing.Value("i", 0)

    # Save original property so the wrapper can delegate to the real getter
    # and so it can be restored on teardown.
    original_prop = CommonAttentionMetadata.seq_lens_cpu
    original_fget = original_prop.fget

    # Create tracking wrapper: only the lazy-init path counts as a sync,
    # i.e. when the CPU copy has not been materialized yet
    # (self._seq_lens_cpu is None).
    def tracking_seq_lens_cpu(self):
        if self._seq_lens_cpu is None:
            # Increment counter under the Value's lock (shared with any
            # forked subprocess).
            with sync_count.get_lock():
                sync_count.value += 1
                count = sync_count.value
            # Print stack trace immediately (shows in subprocess output)
            print(f"\n{'=' * 60}", file=sys.stderr)
            print(f"SYNC #{count}: seq_lens_cpu lazy init triggered!", file=sys.stderr)
            print(f"{'=' * 60}", file=sys.stderr)
            traceback.print_stack(file=sys.stderr)
            print(f"{'=' * 60}\n", file=sys.stderr)
            sys.stderr.flush()
        return original_fget(self)

    # Apply patch at the class level so every instance is tracked.
    CommonAttentionMetadata.seq_lens_cpu = property(tracking_seq_lens_cpu)

    class SyncTracker:
        # Handle yielded to the test for inspecting the shared counter.

        @property
        def count(self) -> int:
            # Number of lazy-init syncs observed so far.
            return sync_count.value

        def assert_no_sync(self, msg: str = ""):
            # Fail if any lazy init fired; the stack traces printed by the
            # wrapper above identify the offending call sites.
            count = sync_count.value
            assert count == 0, (
                f"Unexpected GPU-CPU sync: seq_lens_cpu lazy init triggered "
                f"{count} times. See stack traces above. {msg}"
            )

    yield SyncTracker()

    # Restore original property
    CommonAttentionMetadata.seq_lens_cpu = original_prop
    # Reset dynamo state — presumably so compiled artifacts that captured
    # the patched property do not leak into later tests; TODO confirm.
    torch._dynamo.reset()
# Test configurations: (model, spec_model, method, num_spec_tokens)
# Each entry pairs a target model with its draft (speculator) model.
SPEC_DECODE_CONFIGS = [
    pytest.param(
        "meta-llama/Llama-3.2-1B-Instruct",
        "nm-testing/Llama3_2_1B_speculator.eagle3",
        "eagle3",
        2,
        id="eagle3-llama",
    ),
    # Per its id, this case covers the MLA (DeepSeek) attention path using
    # small random-weight models.
    pytest.param(
        "eagle618/deepseek-v3-random",
        "eagle618/eagle-deepseek-v3-random",
        "eagle",
        2,
        id="eagle-mla-deepseek",
    ),
]
@pytest.mark.parametrize(
    "model,spec_model,method,num_spec_tokens",
    SPEC_DECODE_CONFIGS,
)
def test_no_sync_with_spec_decode(
    sync_tracker,
    model: str,
    spec_model: str,
    method: str,
    num_spec_tokens: int,
):
    """
    Test that no implicit GPU-CPU sync occurs during speculative decoding
    generation.

    Args:
        sync_tracker: fixture counting seq_lens_cpu lazy-init syncs.
        model: target model to serve.
        spec_model: draft (speculator) model.
        method: speculative decoding method (e.g. "eagle", "eagle3").
        num_spec_tokens: number of speculative tokens per step.
    """
    # Import vLLM AFTER sync_tracker fixture has applied the patch
    from vllm import LLM, SamplingParams
    from vllm.distributed import cleanup_dist_env_and_memory

    llm = LLM(
        model=model,
        max_model_len=256,
        speculative_config={
            "method": method,
            "num_speculative_tokens": num_spec_tokens,
            "model": spec_model,
        },
        enforce_eager=True,
        # async_scheduling is the mode an implicit H<>D sync would break.
        async_scheduling=True,
    )
    # Greedy decoding keeps the run deterministic; a short generation is
    # enough to exercise the spec-decode path.
    outputs = llm.generate(
        ["Hello, my name is"],
        SamplingParams(temperature=0, max_tokens=10),
    )
    # Sanity-check that generation actually produced text.
    assert len(outputs) == 1
    assert len(outputs[0].outputs[0].text) > 0

    # Tear down the engine and free GPU memory before the final check.
    del llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()

    # Core assertion: no seq_lens_cpu lazy init was ever triggered.
    sync_tracker.assert_no_sync()

View File

@ -191,8 +191,8 @@ def test_suffix_decoding_acceptance(
# Expect the acceptance rate to improve.
assert first_accept_rate < last_accept_rate
# Heuristic: expect at least 82.5% acceptance rate at the end.
assert last_accept_rate > 0.825
# Heuristic: expect at least 80.0% acceptance rate at the end.
assert last_accept_rate > 0.80
del spec_llm
torch.cuda.empty_cache()

View File

@ -88,8 +88,8 @@ def forward_attention(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc.cpu(),
seq_lens=seq_lens,
seq_lens_cpu=seq_lens.cpu(),
num_computed_tokens_cpu=context_lens.cpu(),
_seq_lens_cpu=seq_lens.cpu(),
_num_computed_tokens_cpu=context_lens.cpu(),
num_reqs=batch_size,
num_actual_tokens=num_actual_tokens,
max_query_len=max_query_len,

View File

@ -103,7 +103,7 @@ def create_cross_attention_backend(
# needed here to know how many tokens to attend to from the cached
# cross-attention KV cache.
new_metadata.seq_lens = common_attn_metadata.encoder_seq_lens
new_metadata.seq_lens_cpu = torch.from_numpy(
new_metadata._seq_lens_cpu = torch.from_numpy(
common_attn_metadata.encoder_seq_lens_cpu
)

View File

@ -788,7 +788,7 @@ async def benchmark(
)
print(
"{:<40} {:<10.2f}".format(
"Total Token throughput (tok/s):", metrics.total_token_throughput
"Total token throughput (tok/s):", metrics.total_token_throughput
)
)

View File

@ -667,8 +667,9 @@ class VllmConfig:
default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level]
self._apply_optimization_level_defaults(default_config)
if (
self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
self.compilation_config.cudagraph_mode.requires_piecewise_compilation()
and self.compilation_config.mode != CompilationMode.VLLM_COMPILE
):
logger.info(
@ -693,22 +694,29 @@ class VllmConfig:
if current_platform.support_static_graph_mode():
# if cudagraph_mode has full cudagraphs, we need to check support
if (
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
and self.model_config is not None
):
if self.model_config.pooler_config is not None:
if model_config := self.model_config:
if (
self.compilation_config.cudagraph_mode.has_full_cudagraphs()
and model_config.pooler_config is not None
):
logger.warning_once(
"Pooling models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
elif self.model_config.is_encoder_decoder:
logger.warning_once(
"Encoder-decoder models do not support full cudagraphs. "
"Overriding cudagraph_mode to PIECEWISE."
elif (
model_config.is_encoder_decoder
and self.compilation_config.cudagraph_mode
not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY)
):
logger.info_once(
"Encoder-decoder models do not support %s. "
"Overriding cudagraph_mode to FULL_DECODE_ONLY.",
self.compilation_config.cudagraph_mode.name,
)
self.compilation_config.cudagraph_mode = (
CUDAGraphMode.FULL_DECODE_ONLY
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
# disable cudagraph when enforce eager execution
if self.model_config is not None and self.model_config.enforce_eager:
@ -1048,8 +1056,14 @@ class VllmConfig:
self.compilation_config.max_cudagraph_capture_size
)
if max_cudagraph_capture_size is None:
decode_query_len = 1
if (
self.speculative_config
and self.speculative_config.num_speculative_tokens
):
decode_query_len += self.speculative_config.num_speculative_tokens
max_cudagraph_capture_size = min(
self.scheduler_config.max_num_seqs * 2, 512
self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
)
max_num_tokens = self.scheduler_config.max_num_batched_tokens
max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)

View File

@ -8,3 +8,5 @@ Shared constants for vLLM entrypoints.
# These constants help mitigate header abuse attacks
H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB
H11_MAX_HEADER_COUNT_DEFAULT = 256
MCP_PREFIX = "mcp_"

View File

@ -19,6 +19,7 @@ from vllm import envs
from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption,
)
from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.parser.harmony_utils import (
get_encoding,
get_streamable_parser_for_assistant,
@ -303,7 +304,7 @@ class ParsableContext(ConversationContext):
result_str = result.content[0].text
message = ResponseFunctionToolCallOutputItem(
id=f"fco_{random_uuid()}",
id=f"mcpo_{random_uuid()}",
type="function_call_output",
call_id=f"call_{random_uuid()}",
output=result_str,
@ -385,6 +386,9 @@ class ParsableContext(ConversationContext):
if not self.parser.response_messages:
return []
last_msg = self.parser.response_messages[-1]
# change this to a mcp_ function call
last_msg.id = f"{MCP_PREFIX}{random_uuid()}"
self.parser.response_messages[-1] = last_msg
if last_msg.name == "code_interpreter":
return await self.call_python_tool(self._tool_sessions["python"], last_msg)
elif last_msg.name == "web_search_preview":

View File

@ -1339,6 +1339,7 @@ class OpenAIServing:
)
engine_prompt = engine_prompts[0]
request_prompt = request_prompts[0]
prompt_text, _, _ = self._get_prompt_components(request_prompt)
# Update the sampling params.
sampling_params.max_tokens = self.max_model_len - len(

View File

@ -22,6 +22,7 @@ from openai.types.responses.response_reasoning_item import ResponseReasoningItem
from openai.types.responses.tool import Tool
from vllm import envs
from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.protocol import (
ChatCompletionMessageParam,
ResponseInputOutputItem,
@ -44,13 +45,13 @@ def make_response_output_items_from_parsable_context(
)
if isinstance(output_messages[-1], ResponseFunctionToolCall):
mcp_message = McpCall(
id=f"mcp_{random_uuid()}",
id=f"{MCP_PREFIX}{random_uuid()}",
arguments=output_messages[-1].arguments,
name=output_messages[-1].name,
server_label=output_messages[
-1
].name, # TODO: store the server label
type="mcp_call",
type=f"{MCP_PREFIX}call",
status="completed",
output=message.output,
# TODO: support error output
@ -98,12 +99,63 @@ def construct_input_messages(
if isinstance(request_input, str):
messages.append({"role": "user", "content": request_input})
else:
for item in request_input:
messages.append(construct_chat_message_with_tool_call(item))
input_messages = construct_chat_messages_with_tool_call(request_input)
messages.extend(input_messages)
return messages
def construct_chat_message_with_tool_call(
def _maybe_combine_reasoning_and_tool_call(
    item: ResponseInputOutputItem, messages: list[ChatCompletionMessageParam]
) -> ChatCompletionMessageParam | None:
    """Fold an MCP tool call into the preceding reasoning message.

    Many models treat MCP calls and reasoning as a single message. When
    ``item`` is an MCP-prefixed function tool call and the last entry in
    ``messages`` is an assistant message carrying reasoning, attach the
    tool call to that entry and return it; otherwise return ``None`` so
    the caller turns ``item`` into its own message.
    """
    is_mcp_call = isinstance(item, ResponseFunctionToolCall) and item.id.startswith(
        MCP_PREFIX
    )
    if not is_mcp_call or not messages:
        return None

    tail = messages[-1]
    tail_has_reasoning = (
        tail.get("role") == "assistant" and tail.get("reasoning") is not None
    )
    if not tail_has_reasoning:
        return None

    tool_call = ChatCompletionMessageToolCallParam(
        id=item.call_id,
        function=FunctionCallTool(
            name=item.name,
            arguments=item.arguments,
        ),
        type="function",
    )
    tail["tool_calls"] = [tool_call]
    return tail
def construct_chat_messages_with_tool_call(
    input_messages: list[ResponseInputOutputItem],
) -> list[ChatCompletionMessageParam]:
    """Convert response items into chat messages, merging where needed.

    Wraps ``_construct_single_message_from_response_item`` because some
    chat messages are built from multiple response items — for example, a
    reasoning item followed by an MCP tool call is two response items but
    one chat message.
    """
    chat_messages: list[ChatCompletionMessageParam] = []
    for response_item in input_messages:
        combined = _maybe_combine_reasoning_and_tool_call(
            response_item, chat_messages
        )
        if combined is None:
            chat_messages.append(
                _construct_single_message_from_response_item(response_item)
            )
        else:
            chat_messages[-1] = combined
    return chat_messages
def _construct_single_message_from_response_item(
item: ResponseInputOutputItem,
) -> ChatCompletionMessageParam:
if isinstance(item, ResponseFunctionToolCall):

View File

@ -881,7 +881,7 @@ class FusedMoE(CustomOp):
# Record that the clone will be used by shared_experts_stream
# to avoid gc issue from deallocation of hidden_states_clone
# For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
# NOTE: We dont need shared_output.record_stream(current_stream())
# NOTE: We don't need shared_output.record_stream(current_stream())
# because we synch the streams before using shared_output.
hidden_states_clone.record_stream(self.shared_experts_stream)

View File

@ -28,7 +28,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
@classmethod
def get_min_capability(cls) -> int:
# dont restrict as emulations
# don't restrict as emulations
return 80
def create_weights(

View File

@ -31,7 +31,6 @@ from vllm.model_executor.utils import replace_parameter
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.deep_gemm import (
DeepGemmQuantScaleFMT,
fp8_gemm_nt,
is_deep_gemm_e8m0_used,
is_deep_gemm_supported,
@ -248,6 +247,7 @@ class W8A8BlockFp8LinearOp:
self.act_quant_group_shape = act_quant_group_shape
self.is_deep_gemm_supported = is_deep_gemm_supported()
self.is_hopper = current_platform.is_device_capability(90)
self.is_blackwell = current_platform.is_device_capability(100)
self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used()
# Get the correct blockscale mul and input quant operations.
@ -303,7 +303,7 @@ class W8A8BlockFp8LinearOp:
weight: torch.Tensor,
weight_scale: torch.Tensor,
) -> torch.Tensor:
if DeepGemmQuantScaleFMT.from_oracle() == DeepGemmQuantScaleFMT.UE8M0:
if self.use_deep_gemm_e8m0 and self.is_blackwell:
q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm(
input_2d,
group_size=self.act_quant_group_shape.col,

View File

@ -502,6 +502,7 @@ class HunYuanVisionTransformer(nn.Module):
cu_seqlens: list = [0]
hidden_states = x.to(device=self.device, dtype=self.dtype)
# embeddings = patch_embeds + patch_pos_embed
hidden_states = self.embeddings(hidden_states, grid_thw)
for t, h, w in grid_thw:
@ -515,8 +516,14 @@ class HunYuanVisionTransformer(nn.Module):
hidden_states = hidden_states.reshape(seq_len, -1)
hidden_states = hidden_states.unsqueeze(0)
for layer_num, layer in enumerate(self.layers):
hidden_states = layer(hidden_states)
# build per-image lengths once
split_lengths = [int(h) * int(w) for (_, h, w) in grid_thw]
for layer in self.layers:
# hidden_states: (1, T_total, D)
parts = hidden_states.split(split_lengths, dim=1) # list of (1, L_i, D)
parts = [layer(p) for p in parts]
hidden_states = torch.cat(parts, dim=1)
# adapter
split_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()

View File

@ -201,7 +201,7 @@ class MiniMaxM2Attention(nn.Module):
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=rotary_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
rope_parameters=rope_parameters,
)

View File

@ -403,6 +403,7 @@ class Qwen3MoeModel(nn.Module):
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.config = config
self.quant_config = quant_config
self.embed_tokens = VocabParallelEmbedding(
config.vocab_size,
config.hidden_size,
@ -505,6 +506,19 @@ class Qwen3MoeModel(nn.Module):
loaded_params: set[str] = set()
expert_params_mapping = self.get_expert_mapping()
for name, loaded_weight in weights:
if self.quant_config is not None and (
scale_name := self.quant_config.get_cache_scale(name)
):
# Loading kv cache quantization scales
param = params_dict[scale_name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
assert loaded_weight.numel() == 1, (
f"KV scale numel {loaded_weight.numel()} != 1"
)
loaded_weight = loaded_weight.squeeze()
weight_loader(param, loaded_weight)
loaded_params.add(scale_name)
continue
for param_name, weight_name, shard_id in stacked_params_mapping:
# Skip non-stacked layers and experts (experts handled below).
if weight_name not in name:

View File

@ -89,7 +89,7 @@ def _extract_data_from_linear_base_module(
assert m.quant_method.quant_config is not None
w = m.weight
ws = m.weight_scale
ws = m.weight_scale_inv if hasattr(m, "weight_scale_inv") else m.weight_scale
quant_block_size = m.quant_method.quant_config.weight_block_size
assert isinstance(w, torch.Tensor)

View File

@ -954,7 +954,7 @@ MultiModalKwargsOptionalItems: TypeAlias = (
)
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.")
@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.14.")
class MultiModalKwargs(UserDict[str, NestedTensors]):
"""
A dictionary that represents the keyword arguments to
@ -964,7 +964,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
@staticmethod
@deprecated(
"`MultiModalKwargs.from_hf_inputs` is deprecated and "
"will be removed in v0.13. "
"will be removed in v0.14. "
"Please use `MultiModalKwargsItems.from_hf_inputs` and "
"access the tensor data using `.get_data()`."
)
@ -977,7 +977,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
@staticmethod
@deprecated(
"`MultiModalKwargs.from_items` is deprecated and "
"will be removed in v0.13. "
"will be removed in v0.14. "
"Please use `MultiModalKwargsItems.from_seq` and "
"access the tensor data using `.get_data()`."
)

View File

@ -429,12 +429,12 @@ def group_mm_kwargs_by_modality(
if merge_by_field_config is not None:
logger.warning_once(
"The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` "
"is deprecated and will be removed in v0.13."
"is deprecated and will be removed in v0.14."
)
if multimodal_cpu_fields is not None:
logger.warning_once(
"The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` "
"is deprecated and will be removed in v0.13."
"is deprecated and will be removed in v0.14."
)
from vllm.multimodal.inputs import MultiModalKwargsItems

View File

@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer):
self.name_or_path = (
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
)
self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)
@classmethod
def from_pretrained(
@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
def __len__(self) -> int:
# </think> is an added token in DeepseekV32 tokenizer
return self.vocab_size + len(self.get_added_vocab())
return self.vocab_size + self._added_vocab_size
def __call__(
self,
@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
return self.tokenizer.get_vocab()
def get_added_vocab(self) -> dict[str, int]:
return self.tokenizer.get_added_vocab()
return self._added_vocab.copy()
def encode(
self,

View File

@ -17,7 +17,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to "
"`vllm.tokenizers.TokenizerLike`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -29,7 +29,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.get_tokenizer` "
"has been moved to `vllm.tokenizers.get_tokenizer`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -41,7 +41,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.cached_get_tokenizer` "
"has been moved to `vllm.tokenizers.cached_get_tokenizer`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -53,7 +53,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` "
"has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -65,7 +65,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` "
"has been moved to `vllm.tokenizers.init_tokenizer_from_config`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -75,7 +75,7 @@ def __getattr__(name: str):
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.")
@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.")
def decode_tokens(
tokenizer: TokenizerLike,
token_ids: list[int],
@ -97,7 +97,7 @@ def decode_tokens(
return tokenizer.decode(token_ids, **kw_args)
@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.")
@deprecated("Will be removed in v0.14. Please use `tokenizer.encode()` instead.")
def encode_tokens(
tokenizer: TokenizerLike,
text: str,

View File

@ -11,7 +11,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been "
"moved to `vllm.tokenizers.TokenizerLike`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)
@ -23,7 +23,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been "
"moved to `vllm.tokenizers.TokenizerRegistry`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)

View File

@ -21,7 +21,7 @@ from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
split_decodes_and_prefills,
)
from vllm.v1.kv_cache_interface import AttentionSpec
from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec
logger = init_logger(__name__)
@ -50,11 +50,13 @@ class CPUAttentionBackend(AttentionBackend):
@classmethod
def supports_attn_type(cls, attn_type: str) -> bool:
"""CPU attention supports decoder and encoder-only attention."""
"""CPU attention supports decoder,
encoder-only and encoder-decoder attention."""
return attn_type in (
AttentionType.DECODER,
AttentionType.ENCODER,
AttentionType.ENCODER_ONLY,
AttentionType.ENCODER_DECODER,
)
@staticmethod
@ -136,6 +138,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
self.window_size = -1
self.block_size = vllm_config.cache_config.block_size
self.isa = _get_attn_isa(self.dtype, self.block_size)
self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec)
def build(
self,
@ -151,7 +154,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
seq_lens = common_attn_metadata.seq_lens
block_table_tensor = common_attn_metadata.block_table_tensor
slot_mapping = common_attn_metadata.slot_mapping
causal = common_attn_metadata.causal
causal = False if self.is_cross_attention else common_attn_metadata.causal
sdpa_start_loc = query_start_loc
num_decode_tokens = 0
@ -171,22 +174,19 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata]
query_start_loc = query_start_loc[: num_decodes + 1]
block_table_tensor = block_table_tensor[:num_decodes]
sheduler_metadata = None
if causal:
# for decode batch, use the custom kernel
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
num_reqs=num_reqs,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
head_dim=self.head_dim,
seq_lens=seq_lens,
dtype=self.dtype,
query_start_loc=query_start_loc,
causal=causal,
sliding_window_size=self.window_size,
isa=self.isa,
enable_kv_split=True,
)
sheduler_metadata = ops.cpu_attn_get_scheduler_metadata(
num_reqs=num_reqs,
num_heads=self.num_heads,
num_kv_heads=self.num_kv_heads,
head_dim=self.head_dim,
seq_lens=seq_lens,
dtype=self.dtype,
query_start_loc=query_start_loc,
causal=causal,
sliding_window_size=self.window_size,
isa=self.isa,
enable_kv_split=True,
)
attn_metadata = CPUAttentionMetadata(
isa=self.isa,

View File

@ -370,6 +370,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
num_accepted_tokens = torch.diff(m.query_start_loc)
num_decode_draft_tokens_cpu = (num_accepted_tokens - 1).cpu()
m.num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
m._num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu()
return self.build(0, m, num_accepted_tokens, num_decode_draft_tokens_cpu)

View File

@ -18,7 +18,7 @@ from typing import (
import numpy as np
import torch
from typing_extensions import runtime_checkable
from typing_extensions import deprecated, runtime_checkable
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.utils.math_utils import cdiv
@ -66,11 +66,6 @@ class CommonAttentionMetadata:
"""(batch_size + 1,), the start location of each request in query Tensor"""
seq_lens: torch.Tensor
seq_lens_cpu: torch.Tensor
"""(batch_size,), the length of each request including both computed tokens
and newly scheduled tokens"""
num_computed_tokens_cpu: torch.Tensor
"""(batch_size,), the number of computed tokens for each request"""
num_reqs: int
@ -81,7 +76,7 @@ class CommonAttentionMetadata:
max_query_len: int
"""Longest query in batch"""
max_seq_len: int
"""Longest context length in batch"""
"""Longest context length (may be an upper bound)"""
block_table_tensor: torch.Tensor
slot_mapping: torch.Tensor
@ -100,6 +95,40 @@ class CommonAttentionMetadata:
dcp_local_seq_lens_cpu: torch.Tensor | None = None
"""Sequence lengths of the local rank in decode context parallelism world"""
# WARNING: Deprecated fields. Will be removed in a future release (v0.14.0)
_seq_lens_cpu: torch.Tensor | None = None
_num_computed_tokens_cpu: torch.Tensor | None = None
@property
@deprecated(
"""
Prefer using device seq_lens directly to avoid implicit H<>D sync.
If a CPU copy is needed, use `seq_lens.cpu()` instead.
Will be removed in a future release (v0.14.0)
"""
)
def seq_lens_cpu(self) -> torch.Tensor:
if self._seq_lens_cpu is None:
self._seq_lens_cpu = self.seq_lens.to("cpu")
return self._seq_lens_cpu
@property
@deprecated(
"""
Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full
async scheduling. If a CPU copy is needed, it can be derived from
query_start_loc_cpu and seq_lens.
Will be removed in a future release (v0.14.0)
"""
)
def num_computed_tokens_cpu(self) -> torch.Tensor:
if self._num_computed_tokens_cpu is None:
query_seq_lens = (
self.query_start_loc_cpu[1:] - self.query_start_loc_cpu[:-1]
)
self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens
return self._num_computed_tokens_cpu
# TODO(lucas): remove once we have FULL-CG spec-decode support
def unpadded(
self, num_actual_tokens: int, num_actual_reqs: int
@ -109,8 +138,12 @@ class CommonAttentionMetadata:
query_start_loc=self.query_start_loc[: num_actual_reqs + 1],
query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1],
seq_lens=self.seq_lens[:num_actual_reqs],
seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs],
num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs],
_seq_lens_cpu=self._seq_lens_cpu[:num_actual_reqs]
if self._seq_lens_cpu is not None
else None,
_num_computed_tokens_cpu=self._num_computed_tokens_cpu[:num_actual_reqs]
if self._num_computed_tokens_cpu is not None
else None,
num_reqs=num_actual_reqs,
num_actual_tokens=num_actual_tokens,
max_query_len=self.max_query_len,
@ -224,14 +257,14 @@ def _make_metadata_with_slice(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=num_requests,
num_actual_tokens=num_actual_tokens,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
)
@ -689,9 +722,7 @@ def make_local_attention_virtual_batches(
return CommonAttentionMetadata(
query_start_loc_cpu=query_start_loc_cpu,
query_start_loc=query_start_loc_cpu.to(device=device, non_blocking=True),
seq_lens_cpu=seq_lens_cpu,
seq_lens=seq_lens_cpu.to(device=device, non_blocking=True),
num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
num_reqs=len(seq_lens_cpu),
num_actual_tokens=common_attn_metadata.num_actual_tokens,
max_query_len=seqlens_q_local.max(),
@ -699,6 +730,8 @@ def make_local_attention_virtual_batches(
block_table_tensor=block_table_local,
slot_mapping=common_attn_metadata.slot_mapping,
causal=True,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local),
)
@ -719,7 +752,6 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
logits_indices = logits_indices_padded[:num_logits_indices]
num_reqs = common_attn_metadata.num_reqs
query_start_loc = common_attn_metadata.query_start_loc
seq_lens = common_attn_metadata.seq_lens
# Example inputs
# num_reqs: 3
# generation_indices: [14, 18, 19, 27]
@ -748,9 +780,7 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
common_attn_metadata = CommonAttentionMetadata(
query_start_loc=decode_query_start_loc,
query_start_loc_cpu=decode_query_start_loc.to("cpu", non_blocking=True),
seq_lens=seq_lens,
seq_lens_cpu=seq_lens.to("cpu", non_blocking=True),
num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
seq_lens=common_attn_metadata.seq_lens,
num_reqs=num_reqs,
num_actual_tokens=total_num_decode_tokens,
max_query_len=decode_max_query_len,
@ -758,6 +788,8 @@ def make_kv_sharing_fast_prefill_common_attn_metadata(
block_table_tensor=common_attn_metadata.block_table_tensor,
slot_mapping=common_attn_metadata.slot_mapping,
causal=True,
_seq_lens_cpu=common_attn_metadata._seq_lens_cpu,
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
)
return common_attn_metadata

View File

@ -192,7 +192,7 @@ class AsyncLLM(EngineClient):
@property
@deprecated(
"`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. "
"The old name will be removed in v0.13."
"The old name will be removed in v0.14."
)
def processor(self):
return self.input_processor

View File

@ -139,7 +139,7 @@ class LLMEngine:
@property
@deprecated(
"`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. "
"The old name will be removed in v0.13."
"The old name will be removed in v0.14."
)
def processor(self):
return self.input_processor

View File

@ -10,7 +10,7 @@ def __getattr__(name: str):
warnings.warn(
"`vllm.v1.engine.processor.Processor` has been moved to "
"`vllm.v1.engine.input_processor.InputProcessor`. "
"The old name will be removed in v0.13.",
"The old name will be removed in v0.14.",
DeprecationWarning,
stacklevel=2,
)

View File

@ -440,16 +440,16 @@ class EagleProposer:
# of main model.
# Increment the sequence lengths.
common_attn_metadata.seq_lens += 1
# This is an out-of-place operation to avoid modifying the original tensor.
common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu + 1
# For the requests that exceed the max model length, we set the
# sequence length to 1 to minimize their overheads in attention.
common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1)
common_attn_metadata.num_computed_tokens_cpu = (
common_attn_metadata.seq_lens_cpu - 1
)
# Also update the CPU-side shadow; NOTE: this is hacky and should be
# removed in when common_attn_metadata.seq_lens_cpu is deprecated.
if common_attn_metadata._seq_lens_cpu is not None:
common_attn_metadata._seq_lens_cpu += 1
if common_attn_metadata._num_computed_tokens_cpu is not None:
common_attn_metadata._num_computed_tokens_cpu += 1
# Compute the slot mapping.
if self.uses_mrope:
@ -656,8 +656,8 @@ class EagleProposer:
query_start_loc=common_attn_metadata.query_start_loc,
seq_lens=common_attn_metadata.seq_lens,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens_cpu=common_attn_metadata.seq_lens_cpu,
num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
_seq_lens_cpu=common_attn_metadata._seq_lens_cpu,
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
num_reqs=common_attn_metadata.num_reqs,
num_actual_tokens=total_num_tokens,
max_query_len=new_query_len_per_req.max().item(),
@ -932,8 +932,8 @@ class EagleProposer:
query_start_loc=new_query_start_loc_cpu.to(device, non_blocking=True),
seq_lens=new_seq_lens_cpu.to(device, non_blocking=True),
query_start_loc_cpu=new_query_start_loc_cpu,
seq_lens_cpu=new_seq_lens_cpu,
num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu,
_seq_lens_cpu=new_seq_lens_cpu,
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu,
num_reqs=common_attn_metadata.num_reqs,
num_actual_tokens=total_num_tokens,
max_query_len=new_query_len_per_req.max().item(),

View File

@ -38,16 +38,16 @@ class MedusaProposer:
self,
target_hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> list[list[int]]:
) -> torch.Tensor:
# Generate blocks and compute logits
blocks = self.model(target_hidden_states)
logits = self.model.compute_logits(blocks)
# Get draft tokens and transpose the result
# TODO(woosuk): OPTIMIZATION: Return GPU tensor without GPU-CPU
# synchronization.
draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits]
return [list(row) for row in zip(*draft_tokens)]
# Compute argmax for each Medusa head and stack into a single tensor
# Shape: [batch_size, num_heads]
draft_tokens = torch.stack([logit.argmax(dim=-1) for logit in logits], dim=1)
return draft_tokens
def load_model(self, target_model: nn.Module) -> None:
from vllm.compilation.backends import set_model_tag

View File

@ -10,7 +10,7 @@ import torch
import vllm.envs
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.tokenizers import MistralTokenizer
from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_types import (
StructuredOutputBackend,
@ -56,6 +56,27 @@ class XgrammarBackend(StructuredOutputBackend):
stop_token_ids=stop_token_ids,
add_prefix_space=True,
)
elif isinstance(self.tokenizer, DeepseekV32Tokenizer):
# copy from xgr.TokenizerInfo.from_huggingface()
# because we are using a custom tokenizer wrapper here.
vocab_dict = self.tokenizer.get_vocab()
tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1)
vocab_size = self.vocab_size or tokenizer_vocab_size
# maintain tokenizer's indexing
encoded_vocab = [""] * vocab_size
for token, idx in vocab_dict.items():
if idx < vocab_size:
encoded_vocab[idx] = token
stop_token_ids = [self.tokenizer.eos_token_id]
backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str()
metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
tokenizer_info = xgr.TokenizerInfo(
encoded_vocab=encoded_vocab,
vocab_type=metadata["vocab_type"],
vocab_size=vocab_size,
stop_token_ids=stop_token_ids,
add_prefix_space=metadata["add_prefix_space"],
)
else:
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
self.tokenizer,

View File

@ -168,9 +168,9 @@ def build_attn_metadata(
query_start_loc=query_start_loc_gpu,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
_seq_lens_cpu=seq_lens_cpu,
max_seq_len=max_seq_len,
num_computed_tokens_cpu=num_computed_tokens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
num_reqs=num_reqs,
num_actual_tokens=num_tokens,
max_query_len=max_query_len,

View File

@ -1267,6 +1267,8 @@ class GPUModelRunner(
if not isinstance(kv_cache_spec, CrossAttentionSpec):
return None, None
# Zero out buffer for padding requests that are not actually scheduled (CGs)
self.encoder_seq_lens.np[:num_reqs] = 0
# Build encoder_seq_lens array mapping request indices to
# encoder lengths for inputs scheduled in this batch
for req_id in num_scheduled_tokens:
@ -1626,8 +1628,8 @@ class GPUModelRunner(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
seq_lens_cpu=seq_lens_cpu,
num_computed_tokens_cpu=num_computed_tokens_cpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
num_actual_tokens=num_tokens_padded,
num_reqs=num_reqs_padded,
max_query_len=max_query_len,
@ -2764,6 +2766,7 @@ class GPUModelRunner(
# be improved in model runner v2)
force_uniform_decode: bool | None = None,
force_has_lora: bool | None = None,
num_encoder_reqs: int = 0,
) -> tuple[
CUDAGraphMode,
BatchDescriptor,
@ -2780,6 +2783,11 @@ class GPUModelRunner(
if force_uniform_decode is None
else force_uniform_decode
)
# Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
# is present). Also, chunked-prefill is disabled, so batch are uniform.
has_encoder_output = (
self.model_config.is_encoder_decoder and num_encoder_reqs > 0
)
has_lora = (
len(self.input_batch.lora_id_to_lora_request) > 0
@ -2799,7 +2807,7 @@ class GPUModelRunner(
)
cudagraph_mode, batch_descriptor = dispatch_cudagraph(
num_tokens_padded, use_cascade_attn
num_tokens_padded, use_cascade_attn or has_encoder_output
)
num_tokens_padded = batch_descriptor.num_tokens
@ -2997,6 +3005,7 @@ class GPUModelRunner(
num_scheduled_tokens_np=num_scheduled_tokens_np,
max_num_scheduled_tokens=max_num_scheduled_tokens,
use_cascade_attn=cascade_attn_prefix_lens is not None,
num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs),
)
logger.debug(
@ -4871,7 +4880,7 @@ class GPUModelRunner(
# we need to adjust the cudagraph sizes to be a multiple of the uniform
# decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
# temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
# Will be removed in the near future when we have seperate cudagraph capture
# Will be removed in the near future when we have separate cudagraph capture
# sizes for decode and mixed prefill-decode.
if (
cudagraph_mode.decode_mode() == CUDAGraphMode.FULL

View File

@ -10,7 +10,7 @@ import torch
import torch.nn as nn
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.distributed import (
ensure_model_parallel_initialized,
init_distributed_environment,
@ -207,7 +207,8 @@ class TPUWorker:
# one compiled bytecode. Having one FX graph/cached bytecode per
# compiled model is required for `support_torch_compile` decorator to
# skip dynamo guard.
self.model_runner.reset_dynamo_cache()
with set_current_vllm_config(self.vllm_config):
self.model_runner.reset_dynamo_cache()
# Get the maximum amount of memory used by the model weights and
# intermediate activations.

View File

@ -135,7 +135,7 @@ class AttentionGroup:
kv_cache_spec: KVCacheSpec
kv_cache_group_id: int
# When ubatching is enabled we will have a metadata builder for each ubatch
# so that if they use internal persistant buffers for cudagraphs, and they
# so that if they use internal persistent buffers for cudagraphs, and they
# won't have to worry about conflicting with the other ubatches.
metadata_builders: list[AttentionMetadataBuilder] = field(
default_factory=lambda: []
@ -313,8 +313,12 @@ def bind_kv_cache(
# TODO - analyze where runner_kv_caches is used and the right
# way to ensure it properly reflects multiple attention layers
# in the same decoder block.
if current_platform.is_cuda_alike() or current_platform.is_xpu():
# We know that the GPU runner is not impacted by this
if (
current_platform.is_cuda_alike()
or current_platform.is_xpu()
or current_platform.is_cpu()
):
# We know that the GPU / CPU runner is not impacted by this
# case. Some test code depends on runner_kv_caches, but
# not in a way that's impacted by ignoring this.
pass