Merge branch 'main' into wye-refactor-quant-folder

2026-07-27 06:47:12 +08:00 · 2025-09-13 07:38:47 -07:00 · 2025-09-13 07:38:47 -07:00 · e925187f6d
commit e925187f6d
parent 1e3e56abfc 15b8fef453
106 changed files with 2761 additions and 997 deletions
--- a/.github/workflows/bc-lint.yml
+++ b/.github/workflows/bc-lint.yml
@ -6,6 +6,8 @@ on:
      - opened
      - synchronize
      - reopened
+      - labeled
+      - unlabeled

 jobs:
  bc_lint:
--- a/README.md
+++ b/README.md
@ -81,7 +81,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support

--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@ -1,77 +1,675 @@
-#!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time
+from collections.abc import Callable

+import matplotlib.pyplot as plt
+import numpy as np
 import torch

 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm,
+    silu_mul_fp8_quant_deep_gemm_cuda,
 )
 from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used


-def benchmark(E, T, H, G=128, runs=50):
-    current_platform.seed_everything(42)
-    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
-    tokens_per_expert = torch.randint(
-        T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
+@triton.jit
+def _silu_mul_fp8_quant_deep_gemm(
+    # Pointers ------------------------------------------------------------
+    input_ptr,  # 16-bit activations (E, T, 2*H)
+    y_q_ptr,  # fp8 quantized activations (E, T, H)
+    y_s_ptr,  # 16-bit scales (E, T, G)
+    counts_ptr,  # int32 num tokens per expert (E)
+    # Sizes ---------------------------------------------------------------
+    H: tl.constexpr,  # hidden dimension (per output)
+    GROUP_SIZE: tl.constexpr,  # elements per group (usually 128)
+    # Strides for input (elements) ---------------------------------------
+    stride_i_e,
+    stride_i_t,
+    stride_i_h,
+    # Strides for y_q (elements) -----------------------------------------
+    stride_yq_e,
+    stride_yq_t,
+    stride_yq_h,
+    # Strides for y_s (elements) -----------------------------------------
+    stride_ys_e,
+    stride_ys_t,
+    stride_ys_g,
+    # Stride for counts (elements)
+    stride_counts_e,
+    # Numeric params ------------------------------------------------------
+    eps: tl.constexpr,
+    fp8_min: tl.constexpr,
+    fp8_max: tl.constexpr,
+    use_ue8m0: tl.constexpr,
+    # Meta ---------------------------------------------------------------
+    BLOCK: tl.constexpr,
+    NUM_STAGES: tl.constexpr,
+):
+    G = H // GROUP_SIZE
+
+    # map program id -> (e, g)
+    pid = tl.program_id(0)
+    e = pid // G
+    g = pid % G
+
+    e = e.to(tl.int64)
+    g = g.to(tl.int64)
+
+    # number of valid tokens for this expert
+    n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
+
+    cols = tl.arange(0, BLOCK).to(tl.int64)
+    mask = cols < BLOCK
+
+    base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
+    base_gate_offset = base_input_offset + cols * stride_i_h
+    base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
+    base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
+    base_ys_offset = e * stride_ys_e + g * stride_ys_g
+
+    for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
+        gate = tl.load(
+            input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
+        ).to(tl.float32)
+        up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
+
+        gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
+        y = gate * up
+
+        y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
+        if use_ue8m0:
+            y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
+
+        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+        tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
+        tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
+
+
+def silu_mul_fp8_quant_deep_gemm_triton(
+    y: torch.Tensor,  # (E, T, 2*H)
+    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
+    num_parallel_tokens,
+    group_size: int = 128,
+    eps: float = 1e-10,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
+
+    y has shape (E, T, 2*H). The first half of the last dimension is
+    silu-activated, multiplied by the second half, then quantized into FP8.
+
+    Returns `(y_q, y_s)` where
+    * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
+    * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+    """
+    assert y.ndim == 3, "y must be (E, T, 2*H)"
+    E, T, H2 = y.shape
+    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
+    H = H2 // 2
+    G = (H + group_size - 1) // group_size
+    assert H % group_size == 0, "H must be divisible by group_size"
+    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, (
+        "tokens_per_expert must be shape (E,)"
+    )
+    tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
+
+    # allocate outputs
+    fp8_dtype = torch.float8_e4m3fn
+    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+    # strides (elements)
+    stride_i_e, stride_i_t, stride_i_h = y.stride()
+    stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+    # desired scale strides (elements): (T*G, 1, T)
+    stride_ys_e = T * G
+    stride_ys_t = 1
+    stride_ys_g = T
+    y_s = torch.empty_strided(
+        (E, T, G),
+        (stride_ys_e, stride_ys_t, stride_ys_g),
+        dtype=torch.float32,
+        device=y.device,
    )

+    stride_cnt_e = tokens_per_expert.stride()[0]
+
+    # Static grid over experts and H-groups.
+    # A loop inside the kernel handles the token dim
+    grid = (E * G,)
+
+    f_info = torch.finfo(fp8_dtype)
+    fp8_max = f_info.max
+    fp8_min = f_info.min
+
+    _silu_mul_fp8_quant_deep_gemm[grid](
+        y,
+        y_q,
+        y_s,
+        tokens_per_expert,
+        H,
+        group_size,
+        stride_i_e,
+        stride_i_t,
+        stride_i_h,
+        stride_yq_e,
+        stride_yq_t,
+        stride_yq_h,
+        stride_ys_e,
+        stride_ys_t,
+        stride_ys_g,
+        stride_cnt_e,
+        eps,
+        fp8_min,
+        fp8_max,
+        is_deep_gemm_e8m0_used(),
+        BLOCK=group_size,
+        NUM_STAGES=4,
+        num_warps=1,
+    )
+
+    return y_q, y_s
+
+
+# Parse generation strategies
+strategies = ["uniform", "max_t", "first_t"]
+
+
+def benchmark(
+    kernel: Callable,
+    E: int,
+    T: int,
+    H: int,
+    total_tokens: int,
+    num_parallel_tokens: int = 64,
+    G: int = 128,
+    runs: int = 200,
+    num_warmups: int = 20,
+    gen_strategy: str = "default",
+    iterations_per_run: int = 20,
+):
+    def generate_data(seed_offset=0):
+        """Generate input data with given seed offset"""
+        current_platform.seed_everything(42 + seed_offset)
+        y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
+
+        if gen_strategy == "uniform":
+            r = torch.rand(size=(E,), device="cuda")
+            r /= r.sum()
+            r *= total_tokens
+            tokens_per_expert = r.int()
+            tokens_per_expert = torch.minimum(
+                tokens_per_expert,
+                torch.ones((E,), device=r.device, dtype=torch.int) * T,
+            )
+        elif gen_strategy == "max_t":
+            tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
+            tokens_per_expert.fill_(total_tokens / E)
+        elif gen_strategy == "first_t":
+            tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda")
+            tokens_per_expert[0] = min(T, total_tokens)
+        else:
+            raise ValueError(f"Unknown generation strategy: {gen_strategy}")
+        return y, tokens_per_expert
+
+    dataset_count = 4
+    # Pre-generate different input matrices for each iteration to avoid cache effects
+    data_sets = [generate_data(i) for i in range(dataset_count)]
+
    # Warmup
-    for _ in range(10):
-        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
-        torch.cuda.synchronize()
+    y, tokens_per_expert = data_sets[0]
+    for _ in range(num_warmups):
+        kernel(
+            y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
+        )
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)

    # Benchmark
-    torch.cuda.synchronize()
-    start = time.perf_counter()
+    latencies: list[float] = []
    for _ in range(runs):
-        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
-    torch.cuda.synchronize()
+        torch.cuda.synchronize()

-    avg_time = (time.perf_counter() - start) / runs * 1000
+        start_event.record()
+        for i in range(iterations_per_run):
+            y, tokens_per_expert = data_sets[i % dataset_count]
+            kernel(
+                y,
+                tokens_per_expert,
+                num_parallel_tokens=num_parallel_tokens,
+                group_size=G,
+            )
+        end_event.record()
+        end_event.synchronize()

-    # Calculate actual work done (only count valid tokens)
+        total_time_ms = start_event.elapsed_time(end_event)
+        per_iter_time_ms = total_time_ms / iterations_per_run
+        latencies.append(per_iter_time_ms)
+
+    # Use median instead of average for better outlier handling
+    median_time_ms = np.median(latencies)
+    median_time_s = median_time_ms / 1000
+
+    # Calculate actual work done (using first dataset for consistency)
+    _, tokens_per_expert = data_sets[0]
    actual_tokens = tokens_per_expert.sum().item()
    actual_elements = actual_tokens * H

    # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
    ops_per_element = 8
    total_ops = actual_elements * ops_per_element
-    gflops = total_ops / (avg_time / 1000) / 1e9
+    gflops = total_ops / median_time_s / 1e9

    # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
    input_bytes = actual_tokens * 2 * H * 2  # 2*H bfloat16 inputs
    output_bytes = actual_tokens * H * 1  # H fp8 outputs
    scale_bytes = actual_tokens * (H // G) * 4  # scales in float32
    total_bytes = input_bytes + output_bytes + scale_bytes
-    memory_bw = total_bytes / (avg_time / 1000) / 1e9
+    memory_bw = total_bytes / median_time_s / 1e9

-    return avg_time, gflops, memory_bw
+    HOPPER_BANDWIDTH_TBPS = 3.35
+    return (
+        median_time_ms,
+        gflops,
+        memory_bw,
+        (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100,
+    )


+def create_comparison_plot(
+    ratio, cuda_times, baseline_times, config_labels, strategy_name, id
+):
+    """Create a comparison plot for a specific generation strategy"""
+    fig, ax = plt.subplots(1, 1, figsize=(16, 6))
+
+    # Configure x-axis positions
+    x = np.arange(len(config_labels))
+    width = 0.35
+
+    # Execution Time plot (lower is better)
+    ax.bar(
+        x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
+    )
+    ax.bar(
+        x + width / 2,
+        baseline_times,
+        width,
+        label="Baseline",
+        alpha=0.8,
+        color="orange",
+    )
+
+    # Add speedup labels over each bar pair
+    for i in range(len(x)):
+        speedup = ratio[i]
+        max_height = max(cuda_times[i], baseline_times[i])
+        ax.text(
+            x[i],
+            max_height + max_height * 0.02,
+            f"{speedup:.2f}x",
+            ha="center",
+            va="bottom",
+            fontweight="bold",
+            fontsize=9,
+        )
+
+    ax.set_xlabel("Configuration")
+    ax.set_ylabel("% Utilization")
+    ax.set_title(
+        f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+    )
+    ax.set_xticks(x)
+    ax.set_xticklabels(config_labels, rotation=45, ha="right")
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    return fig, ax
+
+
+def create_combined_plot(all_results):
+    """Create a combined plot with all strategies in one PNG"""
+    num_strategies = len(all_results)
+    fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))
+
+    if num_strategies == 1:
+        axes = [axes]
+
+    for idx, (
+        strategy_name,
+        ratio,
+        cuda_times,
+        baseline_times,
+        config_labels,
+    ) in enumerate(all_results):
+        ax = axes[idx]
+
+        # Configure x-axis positions
+        x = np.arange(len(config_labels))
+        width = 0.35
+
+        # Execution Time plot (lower is better)
+        ax.bar(
+            x - width / 2,
+            cuda_times,
+            width,
+            label="CUDA Kernel",
+            alpha=0.8,
+            color="blue",
+        )
+        ax.bar(
+            x + width / 2,
+            baseline_times,
+            width,
+            label="Baseline",
+            alpha=0.8,
+            color="orange",
+        )
+
+        # Add speedup labels over each bar pair
+        for i in range(len(x)):
+            speedup = ratio[i]
+            max_height = max(cuda_times[i], baseline_times[i])
+            ax.text(
+                x[i],
+                max_height + max_height * 0.02,
+                f"{speedup:.2f}x",
+                ha="center",
+                va="bottom",
+                fontweight="bold",
+                fontsize=9,
+            )
+
+        ax.set_xlabel("Configuration")
+        ax.set_ylabel("% Utilization")
+        ax.set_title(
+            f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+        )
+        ax.set_xticks(x)
+        ax.set_xticklabels(config_labels, rotation=45, ha="right")
+        ax.legend()
+        ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    filename = "../../silu_bench/silu_benchmark_combined.png"
+    plt.savefig(filename, dpi=300, bbox_inches="tight")
+    plt.show()
+
+    return filename
+
+
+outer_dim = 7168
 configs = [
-    (8, 32, 1024),
-    (16, 64, 2048),
-    (32, 128, 4096),
    # DeepSeekV3 Configs
-    (256, 16, 7168),
-    (256, 32, 7168),
-    (256, 64, 7168),
-    (256, 128, 7168),
-    (256, 256, 7168),
-    (256, 512, 7168),
+    (8, 1024, 7168),
+    # DeepSeekV3 Configs
+    (32, 1024, 7168),
+    # DeepSeekV3 Configs
    (256, 1024, 7168),
 ]

-print(f"GPU: {torch.cuda.get_device_name()}")
-print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
-print("-" * 50)
+runs = 100
+num_warmups = 20

-for E, T, H in configs:
-    try:
-        time_ms, gflops, gbps = benchmark(E, T, H)
-        print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
-    except Exception:
-        print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
+strategy_descriptions = {
+    "uniform": "Uniform Random",
+    "max_t": "Even Assignment",
+    "first_t": "experts[0] = T, experts[1:] = 0",
+}
+
+print(f"GPU: {torch.cuda.get_device_name()}")
+print(f"Testing strategies: {', '.join(strategies)}")
+print(f"Configurations: {len(configs)} configs")
+
+all_results = []
+
+# Run benchmarks for each strategy
+for id, strategy in enumerate(strategies):
+    print(f"\n{'=' * 60}")
+    print(f"Testing strategy: {strategy_descriptions[strategy]}")
+    print(f"{'=' * 60}")
+
+    # Collect benchmark data for both algorithms
+    config_labels = []
+    config_x_axis = []
+    all_cuda_results = []
+    all_baseline_results = []
+    all_ratios = []
+
+    for E, T, H in configs:
+        total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
+        config_x_axis.append(total_tokens_config)
+
+        cuda_results = []
+        baseline_results = []
+        ratios = []
+
+        for total_tokens in total_tokens_config:
+            config_label = f"E={E},T={T},H={H},TT={total_tokens}"
+            config_labels.append(config_label)
+
+            # CUDA kernel results
+            time_ms_cuda, gflops, gbps, perc = benchmark(
+                silu_mul_fp8_quant_deep_gemm_cuda,
+                E,
+                T,
+                H,
+                total_tokens,
+                runs=runs,
+                num_warmups=num_warmups,
+                gen_strategy=strategy,
+            )
+            cuda_results.append((time_ms_cuda, gflops, gbps, perc))
+
+            # Baseline results
+            time_ms_triton, gflops, gbps, perc = benchmark(
+                silu_mul_fp8_quant_deep_gemm_triton,
+                E,
+                T,
+                H,
+                total_tokens,
+                runs=runs,
+                num_warmups=num_warmups,
+                gen_strategy=strategy,
+            )
+            baseline_results.append((time_ms_triton, gflops, gbps, perc))
+            ratios.append(time_ms_triton / time_ms_cuda)
+
+            print(f"Completed: {config_label}")
+        all_cuda_results.append(cuda_results)
+        all_baseline_results.append(baseline_results)
+        all_ratios.append(ratios)
+
+    # Store results for combined plotting
+    all_results.append(
+        (
+            strategy_descriptions[strategy],
+            all_ratios,
+            all_cuda_results,
+            all_baseline_results,
+            config_labels,
+            config_x_axis,
+        )
+    )
+
+    # Print summary table for this strategy
+    print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
+    print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
+    print("-" * 60)
+
+    for i, (E, T, H) in enumerate(configs):
+        speedup = baseline_results[i][0] / cuda_results[i][0]
+        config_label = f"E={E:3d},T={T:4d},H={H:4d}"
+        print(
+            f"{config_label:<20} {cuda_results[i][0]:8.5f} "
+            f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
+        )
+
+
+def create_total_tokens_plot(all_results):
+    num_strategies = len(all_results)
+    num_configs = len(configs)
+
+    # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
+    fig, axs = plt.subplots(
+        num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
+    )
+
+    # Add main title to the entire figure
+    fig.suptitle(
+        "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
+        fontsize=16,
+        fontweight="bold",
+        y=0.98,
+    )
+
+    # Handle single strategy case
+    if num_strategies == 1:
+        axs = axs.reshape(1, -1)
+
+    # Handle single config case
+    if num_configs == 1:
+        axs = axs.reshape(-1, 2)
+
+    for strategy_idx, result in enumerate(all_results):
+        (
+            strategy_name,
+            all_ratios,
+            all_cuda_results,
+            all_baseline_results,
+            config_labels,
+            config_x_axis,
+        ) = result
+
+        for config_idx in range(num_configs):
+            # Speedup plot (left column)
+            ax_speedup = axs[strategy_idx, config_idx * 2]
+            # Bandwidth plot (right column)
+            ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1]
+
+            E, T, H = configs[config_idx]
+            ratios = all_ratios[config_idx]
+            total_tokens_values = config_x_axis[config_idx]
+
+            # Extract CUDA and Triton bandwidth percentages
+            cuda_bandwidth_percentages = [
+                result[3] for result in all_cuda_results[config_idx]
+            ]
+            triton_bandwidth_percentages = [
+                result[3] for result in all_baseline_results[config_idx]
+            ]
+
+            # Plot speedup ratios vs total tokens (left plot)
+            ax_speedup.plot(
+                total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
+            )
+            ax_speedup.set_title(
+                f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
+                fontsize=12,
+                fontweight="bold",
+            )
+            ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+            ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+            ax_speedup.grid(True, alpha=0.3)
+
+            ax_bandwidth.plot(
+                total_tokens_values,
+                cuda_bandwidth_percentages,
+                "ro-",
+                linewidth=3,
+                markersize=8,
+                label="CUDA",
+            )
+            ax_bandwidth.plot(
+                total_tokens_values,
+                triton_bandwidth_percentages,
+                "go-",
+                linewidth=3,
+                markersize=8,
+                label="Triton",
+            )
+            ax_bandwidth.set_title(
+                f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
+                fontsize=12,
+                fontweight="bold",
+            )
+            ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+            ax_bandwidth.set_ylabel(
+                "% of Peak Bandwidth", fontweight="bold", fontsize=11
+            )
+            ax_bandwidth.legend(prop={"weight": "bold"})
+            ax_bandwidth.grid(True, alpha=0.3)
+
+            # Format x-axis labels for both plots
+            for ax in [ax_speedup, ax_bandwidth]:
+                ax.set_xticks(total_tokens_values)
+                ax.set_xticklabels(
+                    [
+                        f"{tt // 1000}K" if tt >= 1000 else str(tt)
+                        for tt in total_tokens_values
+                    ],
+                    fontweight="bold",
+                )
+                # Make tick labels bold
+                for label in ax.get_xticklabels() + ax.get_yticklabels():
+                    label.set_fontweight("bold")
+
+            # Add value labels on speedup points
+            for x, y in zip(total_tokens_values, ratios):
+                ax_speedup.annotate(
+                    f"{y:.2f}x",
+                    (x, y),
+                    textcoords="offset points",
+                    xytext=(0, 12),
+                    ha="center",
+                    fontsize=10,
+                    fontweight="bold",
+                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
+                )
+
+            # Add value labels on CUDA bandwidth points
+            for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
+                ax_bandwidth.annotate(
+                    f"{y:.1f}%",
+                    (x, y),
+                    textcoords="offset points",
+                    xytext=(0, 12),
+                    ha="center",
+                    fontsize=9,
+                    fontweight="bold",
+                    bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
+                )
+
+            # Add value labels on Triton bandwidth points
+            for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
+                ax_bandwidth.annotate(
+                    f"{y:.1f}%",
+                    (x, y),
+                    textcoords="offset points",
+                    xytext=(0, -15),
+                    ha="center",
+                    fontsize=9,
+                    fontweight="bold",
+                    bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3),
+                )
+
+    plt.tight_layout()
+    plt.subplots_adjust(top=0.93)  # Make room for main title
+    filename = "silu_benchmark_total_tokens.png"
+    plt.savefig(filename, dpi=300, bbox_inches="tight")
+    plt.show()
+
+    return filename
+
+
+# Create combined plot with all strategies
+combined_plot_filename = create_total_tokens_plot(all_results)
+
+print(f"\n{'=' * 60}")
+print("Benchmark Complete!")
+print(f"Generated combined plot: {combined_plot_filename}")
+print(f"{'=' * 60}")
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@ -43,6 +43,7 @@ void sm100_cutlass_mla_decode(
    torch::Tensor const& seq_lens,
    torch::Tensor const& page_table,
    torch::Tensor const& workspace,
+    double sm_scale,
    int64_t num_kv_splits) {
  TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -122,12 +122,6 @@ void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      std::optional<torch::Tensor> key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);

-void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                              std::optional<torch::Tensor> key,
-                              int64_t head_size, torch::Tensor& cos_sin_cache,
-                              bool is_neox, int64_t rot_dim,
-                              torch::Tensor& cos_sin_cache_offsets);
-
 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);

 void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
@ -139,6 +133,12 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                              torch::Tensor& input,
                              torch::Tensor& input_global_scale);
 #endif
+void silu_mul_fp8_quant_deep_gemm_cuda(
+    const at::Tensor& input,   // (E, T, 2*H)
+    const at::Tensor& counts,  // (E)
+    at::Tensor& y_q,           // (E, T, H) [OUT]
+    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
+    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens);

 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);

--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@ -99,35 +99,6 @@ __global__ void rotary_embedding_kernel(
      token_idx, query_stride, key_stride, head_stride);
 }

-template <typename scalar_t, bool IS_NEOX>
-__global__ void batched_rotary_embedding_kernel(
-    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
-                                            // [num_tokens]
-    scalar_t* __restrict__ query,           // [batch_size, seq_len, num_heads,
-                                   // head_size] or [num_tokens, num_heads,
-                                   // head_size]
-    scalar_t* __restrict__ key,  // nullptr or
-                                 // [batch_size, seq_len, num_kv_heads,
-                                 // head_size] or [num_tokens, num_kv_heads,
-                                 // head_size]
-    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
-                                                 // 2]
-    const int64_t* __restrict__ cos_sin_cache_offsets,  // [batch_size, seq_len]
-    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
-    const int64_t head_stride, const int num_heads, const int num_kv_heads,
-    const int head_size) {
-  // Each thread block is responsible for one token.
-  const int token_idx = blockIdx.x;
-  int64_t pos = positions[token_idx];
-  int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx];
-  const scalar_t* cache_ptr =
-      cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim;
-
-  apply_rotary_embedding<scalar_t, IS_NEOX>(
-      query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
-      token_idx, query_stride, key_stride, head_stride);
-}
-
 }  // namespace vllm

 void rotary_embedding(
@ -211,96 +182,3 @@ void rotary_embedding(
    }
  });
 }
-
-/*
-Batched version of rotary embedding, pack multiple LoRAs together
-and process in batched manner.
-*/
-void batched_rotary_embedding(
-    torch::Tensor& positions,  // [batch_size, seq_len] or [num_tokens]
-    torch::Tensor& query,  // [batch_size, seq_len, num_heads * head_size] or
-                           // [num_tokens, num_heads * head_size] or
-                           // [batch_size, seq_len, num_heads, head_size] or
-                           // [num_tokens, num_heads, head_size]
-    std::optional<torch::Tensor>
-        key,  // null or
-              // [batch_size, seq_len, num_kv_heads * head_size] or
-              // [num_tokens, num_kv_heads * head_size] or
-              // [batch_size, seq_len, num_heads, head_size] or
-              // [num_tokens, num_heads, head_size]
-    int64_t head_size,
-    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
-    bool is_neox, int64_t rot_dim,
-    torch::Tensor& cos_sin_cache_offsets  // [num_tokens] or [batch_size]
-) {
-  // num_tokens = batch_size * seq_len
-  int64_t num_tokens = cos_sin_cache_offsets.size(0);
-  TORCH_CHECK(
-      positions.size(0) == num_tokens || positions.numel() == num_tokens,
-      "positions must have the same num_tokens or batch_size as "
-      "cos_sin_cache_offsets");
-
-  int positions_ndim = positions.dim();
-  // Make sure num_tokens dim is consistent across positions, query, and key
-  TORCH_CHECK(
-      positions_ndim == 1 || positions_ndim == 2,
-      "positions must have shape [num_tokens] or [batch_size, seq_len]");
-  if (positions_ndim == 1) {
-    TORCH_CHECK(query.size(0) == positions.size(0) &&
-                    (!key.has_value() || key->size(0) == positions.size(0)),
-                "query, key and positions must have the same number of tokens");
-  }
-  if (positions_ndim == 2) {
-    TORCH_CHECK(
-        query.size(0) == positions.size(0) &&
-            (!key.has_value() || key->size(0) == positions.size(0)) &&
-            query.size(1) == positions.size(1) &&
-            (!key.has_value() || key->size(1) == positions.size(1)),
-        "query, key and positions must have the same batch_size and seq_len");
-  }
-
-  // Make sure head_size is valid for query and key
-  int query_hidden_size = query.numel() / num_tokens;
-  int key_hidden_size = key.has_value() ? key->numel() / num_tokens : 0;
-  TORCH_CHECK(query_hidden_size % head_size == 0);
-  TORCH_CHECK(key_hidden_size % head_size == 0);
-
-  // Make sure query and key have concistent number of heads
-  int num_heads = query_hidden_size / head_size;
-  int num_kv_heads = key.has_value() ? key_hidden_size / head_size : num_heads;
-  TORCH_CHECK(num_heads % num_kv_heads == 0);
-
-  int seq_dim_idx = positions_ndim - 1;
-  int64_t query_stride = query.stride(seq_dim_idx);
-  int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0;
-  // Determine head stride: for [*, heads, head_size] use stride of last dim;
-  // for flat [*, heads*head_size], heads blocks are contiguous of size
-  // head_size
-  int query_ndim = query.dim();
-  int64_t head_stride =
-      (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size;
-
-  dim3 grid(num_tokens);
-  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
-    if (is_neox) {
-      vllm::batched_rotary_embedding_kernel<scalar_t, true>
-          <<<grid, block, 0, stream>>>(
-              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-              cos_sin_cache.data_ptr<scalar_t>(),
-              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-              key_stride, head_stride, num_heads, num_kv_heads, head_size);
-    } else {
-      vllm::batched_rotary_embedding_kernel<scalar_t, false>
-          <<<grid, block, 0, stream>>>(
-              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
-              key.has_value() ? key->data_ptr<scalar_t>() : nullptr,
-              cos_sin_cache.data_ptr<scalar_t>(),
-              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
-              key_stride, head_stride, num_heads, num_kv_heads, head_size);
-    }
-  });
-}
--- a/csrc/quantization/activation_kernels.cu
+++ b/csrc/quantization/activation_kernels.cu
@ -9,6 +9,26 @@

 #include "quantization/w8a8/fp8/common.cuh"

+#include <c10/util/Float8_e4m3fn.h>
+
+#ifndef USE_ROCM
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
+  #include <cuda_fp8.h>
+#else
+  #include <hip/hip_bf16.h>
+  #include <hip/hip_fp16.h>
+  #include <hip/hip_fp8.h>
+
+typedef __hip_bfloat162 __nv_bfloat162;
+typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat16_raw __nv_bfloat16_raw;
+
+typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
+typedef __hip_fp8x4_e4m3 __nv_fp8x4_e4m3;
+#endif
+
+#include "core/registration.h"
 namespace vllm {

 template <typename T>
@ -87,6 +107,337 @@ __global__ void act_and_mul_quant_kernel(
    }
  }
 }
+
+__device__ __forceinline__ float silu(float x) {
+  return (__fdividef(x, (1.f + expf(-x))));
+}
+
+__device__ __forceinline__ float2 silu2(float2 x) {
+  return make_float2(silu(x.x), silu(x.y));
+}
+
+#ifndef USE_ROCM
+__device__ __forceinline__ float warp_max(float v) {
+  static constexpr unsigned FULL_MASK = 0xffffffffu;
+  for (int offset = 1; offset < WARP_SIZE; offset *= 2) {
+    v = fmaxf(v, __shfl_xor_sync(FULL_MASK, v, offset));
+  }
+  return v;
+}
+
+__device__ __forceinline__ __nv_bfloat16 warp_max(__nv_bfloat16 v) {
+  static constexpr unsigned FULL_MASK = 0xffffffffu;
+  for (int offset = 1; offset < WARP_SIZE; offset *= 2) {
+    v = __hmax(v, __shfl_xor_sync(FULL_MASK, v, offset));
+  }
+  return v;
+}
+#endif
+
+template <typename T, typename U>
+__device__ __forceinline__ void cp_async4(T* _smem_ptr, const U* _glob_ptr) {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  auto smem_ptr = reinterpret_cast<void*>(_smem_ptr);
+  auto glob_ptr = reinterpret_cast<const void*>(_glob_ptr);
+  const int BYTES = 16;
+  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
+  asm volatile(
+      "{\n"
+      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
+      "}\n" ::"r"(smem),
+      "l"(glob_ptr), "n"(BYTES));
+#else
+  _smem_ptr[0] = _glob_ptr[0];
+#endif
+}
+
+__device__ __forceinline__ void cp_async_fence() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.commit_group;\n" ::);
+#else
+#endif
+}
+
+template <int N>
+__device__ __forceinline__ void cp_async_wait() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
+#else
+#endif
+}
+
+template <>
+__device__ __forceinline__ void cp_async_wait<0>() {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  asm volatile("cp.async.wait_all;\n" ::);
+#else
+#endif
+}
+
+__device__ __forceinline__ float clip(float v, float mmin, float mmax) {
+#if __CUDACC_VER_MAJOR__ >= 11 && __CUDA_ARCH__ >= 800
+  return fminf(mmax, fmaxf(v, mmin));
+#else
+#endif
+}
+
+__device__ __forceinline__ __nv_bfloat16 clip(__nv_bfloat16 v,
+                                              __nv_bfloat16 mmin,
+                                              __nv_bfloat16 mmax) {
+  return __hmin(mmax, __hmax(v, mmin));
+}
+
+__device__ __forceinline__ __nv_bfloat162 clip(__nv_bfloat162 v,
+                                               __nv_bfloat162 mmin,
+                                               __nv_bfloat162 mmax) {
+  return __hmin2(mmax, __hmax2(v, mmin));
+}
+
+// We use the following values for fp8 min/max:
+//  __nv_fp8_e4m3 = (-448, +448)
+//  __nv_fp8_e4m3uz = (-240.0, +240.0)
+// It is currently assumed that only
+template <class T>
+constexpr __nv_bfloat16 get_fp8_max() {
+  static_assert(std::is_same_v<T, c10::Float8_e4m3fn> ||
+                std::is_same_v<T, c10::Float8_e4m3fnuz>);
+  if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 17376});
+  } else {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 17264});
+  }
+}
+
+template <class T>
+constexpr __nv_bfloat16 get_fp8_min() {
+  static_assert(std::is_same_v<T, c10::Float8_e4m3fn> ||
+                std::is_same_v<T, c10::Float8_e4m3fnuz>);
+  if constexpr (std::is_same_v<T, c10::Float8_e4m3fn>) {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 50144});
+  } else {
+    return __nv_bfloat16(__nv_bfloat16_raw{.x = 50032});
+  }
+}
+#ifndef USE_ROCM
+template <typename fp8_type, int32_t NUM_WARPS, typename Idx_t,
+          int NUM_PARALLEL_TOKENS, bool USE_UE8M0, int GROUP_SIZE = 128,
+          int NUM_STAGES = 3>
+__global__ void silu_mul_fp8_quant_deep_gemm_kernel(
+    const __nv_bfloat16* __restrict__ _input, fp8_type* __restrict__ _y_q,
+    float* __restrict__ _y_s, const int32_t* __restrict__ counts,
+
+    // sizes
+    int H, int G,
+
+    // strides (in elements)
+    Idx_t stride_i_e, Idx_t stride_i_t, Idx_t stride_i_h, Idx_t stride_yq_e,
+    Idx_t stride_yq_t, Idx_t stride_yq_h, Idx_t stride_ys_e, Idx_t stride_ys_t,
+    Idx_t stride_ys_g, Idx_t stride_counts_e) {
+  static constexpr __nv_bfloat16 fp8_min = get_fp8_min<fp8_type>();
+  static constexpr __nv_bfloat16 fp8_max = get_fp8_max<fp8_type>();
+  // We assign EPS with its 16-bit unsigned counterpart to allow constexpr.
+  static constexpr __nv_bfloat16 EPS = (__nv_bfloat16_raw{.x = 11996});
+
+  // We pack 8 16-bit bfloat16 values into a 128-bit __int128_t.
+  static constexpr int32_t BFLOAT16_PER_GROUP = 8;
+
+  // We split the shared memory in half, corresponding to gate and up matrices:
+  // [...gate_i, ...up_i]  where 0 <= i < stages.
+  static constexpr int32_t S_NUM_128 =
+      2u * (GROUP_SIZE / BFLOAT16_PER_GROUP) * NUM_WARPS * NUM_STAGES;
+  static constexpr auto THREAD_COUNT = NUM_WARPS * WARP_SIZE;
+  static constexpr int HALF_THREAD_COUNT = THREAD_COUNT / 2;
+  static constexpr int32_t S_NUM_64 = S_NUM_128 * 2;
+  __shared__ __int128_t __align__(16) s_buff_128[S_NUM_128];
+
+  const int32_t tid = threadIdx.x;
+  const int32_t warp_id = tid / WARP_SIZE;
+  const int32_t lane_id = tid % WARP_SIZE;
+
+  auto s_buff_compute_32 = reinterpret_cast<__nv_bfloat162*>(s_buff_128);
+
+  // block handles one (expert e, group g)
+  int32_t pid = blockIdx.x;
+  int32_t e = pid / G;
+  int32_t g = pid % G;
+
+  const int32_t n_tokens = counts[e * stride_counts_e];
+
+  if (!n_tokens) {
+    return;  // Exit ASAP.
+  }
+
+  const Idx_t stride_i_t_128 = stride_i_t / 8u;
+
+  int32_t n_tokens_lower, n_tokens_upper;
+
+  // Each block i iterates over tokens of a slice of n_tokens =
+  // expert_counts[i], with the size of chunk being
+  // (n_tokens / NUM_PARALLEL_TOKENS) + residual, instead of
+  // updiv(n_tokens, NUM_PARALLEL_TOKENS) for better scheduling.
+  if (n_tokens < NUM_PARALLEL_TOKENS && blockIdx.y < n_tokens) {
+    // Specialize this, but can be likely fused.
+    if (blockIdx.y >= NUM_PARALLEL_TOKENS) {
+      return;
+    }
+    n_tokens_lower = blockIdx.y;
+    n_tokens_upper = blockIdx.y + 1;
+  } else {
+    auto chunk_size = n_tokens / NUM_PARALLEL_TOKENS;
+    auto residual = n_tokens - chunk_size * NUM_PARALLEL_TOKENS;
+    auto calc_id = [&](int32_t id) {
+      if (id < residual) {
+        return min(n_tokens, id * (chunk_size + 1));
+      } else {
+        return min(n_tokens, id * chunk_size + residual);
+      }
+    };
+    n_tokens_lower = calc_id(blockIdx.y);
+    n_tokens_upper = calc_id(blockIdx.y + 1);
+  }
+
+  if (n_tokens_lower >= n_tokens_upper) {
+    return;
+  }
+
+  // We do calculations here, using constexpr wherever possible.
+  const Idx_t base_i = e * stride_i_e + NUM_WARPS * g * GROUP_SIZE * stride_i_h;
+  const Idx_t base_ys = e * stride_ys_e + NUM_WARPS * g * stride_ys_g;
+  const Idx_t base_yq =
+      e * stride_yq_e + NUM_WARPS * g * GROUP_SIZE * stride_yq_h;
+  Idx_t gate_off_128 = (base_i / static_cast<Idx_t>(8u));
+  auto input_128_ptr = reinterpret_cast<const __int128_t*>(_input);
+  auto gate_128_ptr = input_128_ptr + gate_off_128 + (tid % HALF_THREAD_COUNT) +
+                      stride_i_t_128 * n_tokens_lower;
+  auto up_128_ptr = gate_128_ptr + (H * stride_i_h) / 8u;
+  auto y_s_ptr =
+      _y_s + base_ys + warp_id * stride_ys_g + n_tokens_lower * stride_ys_t;
+  auto y_q_ptr = _y_q + base_yq + warp_id * GROUP_SIZE +
+                 stride_yq_t * n_tokens_lower + 4 * lane_id;
+  int32_t t_load = n_tokens_lower, load_stage_id = 0;
+  auto s_buff_gate_load_128 = s_buff_128 + (tid % HALF_THREAD_COUNT);
+  auto s_buff_up_load_128 = s_buff_gate_load_128 + S_NUM_128 / 2u;
+  int32_t stage_offset{};
+
+  static constexpr int32_t LOAD_STAGE_SIZE = (NUM_WARPS * WARP_SIZE / 2);
+  static constexpr int32_t LOAD_STAGE_MOD =
+      NUM_STAGES * (NUM_WARPS * WARP_SIZE / 2);
+
+  // Two halves of all threads in a block conduct global loads for gate and up,
+  // repsectively.
+  auto load_and_advance_y_pred = [&] {
+    if (t_load < n_tokens_upper) {
+      auto s_gate_stage_128_staged_ptr = s_buff_gate_load_128 + stage_offset;
+      auto s_up_stage_128_staged_ptr = s_buff_up_load_128 + stage_offset;
+
+      // It is very important that LOAD_STAGE_SIZE is constexpr to avoid
+      // unnecessary ALU ops.
+      stage_offset += LOAD_STAGE_SIZE;
+      stage_offset %= LOAD_STAGE_MOD;
+
+      if (tid < HALF_THREAD_COUNT) {
+        cp_async4(s_gate_stage_128_staged_ptr, gate_128_ptr);
+        gate_128_ptr += stride_i_t_128;
+      } else {
+        cp_async4(s_up_stage_128_staged_ptr, up_128_ptr);
+        up_128_ptr += stride_i_t_128;
+      }
+      ++t_load;
+      ++load_stage_id;
+    }
+    // We fence even if there is nothing to load to simplify pipelining.
+    cp_async_fence();
+  };
+
+  #pragma unroll
+  for (int i = 0; i < NUM_STAGES - 1; i++) {
+    load_and_advance_y_pred();
+  }
+
+  __int64_t* s_gate_ptr = reinterpret_cast<__int64_t*>(
+                              s_buff_compute_32 + warp_id * (GROUP_SIZE / 2)) +
+                          lane_id;
+  __int64_t* s_up_ptr = s_gate_ptr + S_NUM_64 / 2;
+
+  static constexpr int32_t STAGE_SIZE = (GROUP_SIZE * NUM_WARPS) / 4u;
+  static constexpr int32_t STAGE_MOD = STAGE_SIZE * NUM_STAGES;
+
+  int32_t compute_pipeline_offset_64 = 0;
+
+  for (int32_t t = n_tokens_lower; t < n_tokens_upper; ++t) {
+    __nv_bfloat16 y_max_bf16 = EPS;
+    __nv_bfloat162 results_bf162[2];
+
+    cp_async_wait<NUM_STAGES - 2>();
+    __syncthreads();
+
+    // We double-buffer pipelined loads so that the next load will
+    // concurrently run with compute without overwrites.
+    load_and_advance_y_pred();
+
+    auto s_gate_compute_64 = s_gate_ptr + compute_pipeline_offset_64;
+    auto s_up_compute_64 = s_up_ptr + compute_pipeline_offset_64;
+
+    // STAGE_SIZE must also be constexpr!
+    compute_pipeline_offset_64 += STAGE_SIZE;
+    compute_pipeline_offset_64 %= STAGE_MOD;
+
+    // Each thread loads (gate/up) 2X 4X bfloat16 values into registers.
+    __int64_t gate64 = *s_gate_compute_64;
+    __nv_bfloat162* s_gate_compute_32 =
+        reinterpret_cast<__nv_bfloat162*>(&gate64);
+
+    __int64_t up64 = *s_up_compute_64;
+    __nv_bfloat162* s_up_compute_32 = reinterpret_cast<__nv_bfloat162*>(&up64);
+
+  #pragma unroll
+    for (int i = 0; i < 2; i++) {
+      // For silu, we make sure that div is emitted.
+      float2 gate = silu2(__bfloat1622float2(s_gate_compute_32[i]));
+      results_bf162[i] = __float22bfloat162_rn(gate);
+    }
+
+  #pragma unroll
+    for (int i = 0; i < 2; i++) {
+      results_bf162[i] = __hmul2(results_bf162[i], s_up_compute_32[i]);
+    }
+
+    auto _y_max2 =
+        __hmax2(__habs2(results_bf162[0]), __habs2(results_bf162[1]));
+
+    y_max_bf16 = __hmax(_y_max2.x, _y_max2.y);
+
+    // An entire group is assigned to a single warp, so a simple warp reduce
+    // is used.
+    __nv_bfloat16 y_s = warp_max(y_max_bf16) / fp8_max;
+
+    if constexpr (USE_UE8M0) {
+      y_s = hexp2(hceil(hlog2(y_s)));
+    }
+
+    auto inv_y = __float2bfloat16_rn(1.f) / y_s;
+
+    auto y_s2 = make_bfloat162(inv_y, inv_y);
+
+  #pragma unroll
+    for (int32_t i = 0; i < 2; ++i) {
+      results_bf162[i] =
+          clip(__hmul2(results_bf162[i], y_s2), __bfloat162bfloat162(fp8_min),
+               __bfloat162bfloat162(fp8_max));
+    }
+
+    auto fp8x4 = __nv_fp8x4_e4m3(results_bf162[0], results_bf162[1]);
+    *reinterpret_cast<__nv_fp8x4_e4m3*>(y_q_ptr) = fp8x4;
+    y_q_ptr += stride_yq_t;
+
+    if (lane_id == 0) {
+      *y_s_ptr = y_s;
+      y_s_ptr += stride_ys_t;
+    }
+  }
+}
+#endif
+
 }  // namespace vllm

 // Launch activation, gating, and quantize kernel.
@ -119,3 +470,117 @@ void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
  TORCH_CHECK(input.size(-1) % 2 == 0);
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
 }
+
+void silu_mul_fp8_quant_deep_gemm_cuda(
+    const at::Tensor& input,   // (E, T, 2*H)
+    const at::Tensor& counts,  // (E)
+    at::Tensor& y_q,           // (E, T, H) [OUT]
+    at::Tensor& y_s,           // (E, T, H//group_size) [OUT]
+    int64_t group_size, bool use_ue8m0, int64_t num_parallel_tokens) {
+#ifndef USE_ROCM
+  // This kernel relies heavily on cp.async and fp8 support.
+  // This kernel currently only supports H % 128 == 0 and assumes a
+  // fixed GROUP_SIZE of 128.
+  TORCH_CHECK(input.dtype() == torch::kBFloat16);
+  TORCH_CHECK(y_q.dtype() == torch::kFloat8_e4m3fn ||
+              y_q.dtype() == torch::kFloat8_e4m3fnuz);
+  TORCH_CHECK(y_s.dtype() == torch::kFloat32);
+  TORCH_CHECK(input.size(-1) % 256 == 0);
+
+  // Check that num_parallel_tokens is of power of 2 and between 1 and 64.
+  TORCH_CHECK(1 <= num_parallel_tokens && num_parallel_tokens <= 64);
+  TORCH_CHECK(!(num_parallel_tokens & (num_parallel_tokens - 1)));
+
+  using Idx_t = int64_t;
+
+  Idx_t E = input.size(0);
+  Idx_t T = input.size(1);
+  Idx_t H = input.size(2) / 2;
+  Idx_t stride_i_e = input.stride(0);
+  Idx_t stride_i_t = input.stride(1);
+  Idx_t stride_i_h = input.stride(2);
+  Idx_t stride_yq_e = y_q.stride(0);
+  Idx_t stride_yq_t = y_q.stride(1);
+  Idx_t stride_yq_h = y_q.stride(2);
+  Idx_t stride_ys_e = y_s.stride(0);
+  Idx_t stride_ys_t = y_s.stride(1);
+  Idx_t stride_ys_g = y_s.stride(2);
+
+  Idx_t stride_counts_e = counts.stride(0);
+
+  static constexpr int GROUP_SIZE = 128;
+
+  #define KERNEL_FN                                                         \
+    if (use_ue8m0) {                                                        \
+      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
+                                                NUM_PARALLEL_TOKENS, true>  \
+          <<<grid, block, 0, stream>>>(                                     \
+              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
+              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
+              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
+              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
+              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
+              stride_counts_e);                                             \
+    } else {                                                                \
+      vllm::silu_mul_fp8_quant_deep_gemm_kernel<fp8_t, NUM_WARPS, Idx_t,    \
+                                                NUM_PARALLEL_TOKENS, false> \
+          <<<grid, block, 0, stream>>>(                                     \
+              reinterpret_cast<__nv_bfloat16*>(input.data_ptr()),           \
+              (fp8_t*)y_q.data_ptr(), y_s.data_ptr<float>(),                \
+              reinterpret_cast<int32_t*>(counts.data_ptr<int>()), H, G,     \
+              stride_i_e, stride_i_t, stride_i_h, stride_yq_e, stride_yq_t, \
+              stride_yq_h, stride_ys_e, stride_ys_t, stride_ys_g,           \
+              stride_counts_e);                                             \
+    }
+
+  #define KERNEL_CALL_H                                       \
+    if (H % (4 * GROUP_SIZE) == 0) {                          \
+      static constexpr int NUM_WARPS = 4;                     \
+      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
+      KERNEL_FN                                               \
+    } else {                                                  \
+      static constexpr int NUM_WARPS = 1;                     \
+      populate_launch_params(NUM_WARPS, NUM_PARALLEL_TOKENS); \
+      KERNEL_FN                                               \
+    }
+
+  #define KERNEL_CALL_TOP_LEVEL                      \
+    if (num_parallel_tokens == 1) {                  \
+      static constexpr int NUM_PARALLEL_TOKENS = 1;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 2) {           \
+      static constexpr int NUM_PARALLEL_TOKENS = 2;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 4) {           \
+      static constexpr int NUM_PARALLEL_TOKENS = 4;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 8) {           \
+      static constexpr int NUM_PARALLEL_TOKENS = 8;  \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 16) {          \
+      static constexpr int NUM_PARALLEL_TOKENS = 16; \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 32) {          \
+      static constexpr int NUM_PARALLEL_TOKENS = 32; \
+      KERNEL_CALL_H                                  \
+    } else if (num_parallel_tokens == 64) {          \
+      static constexpr int NUM_PARALLEL_TOKENS = 64; \
+      KERNEL_CALL_H                                  \
+    }
+
+  Idx_t G;
+  dim3 block, grid;
+  auto populate_launch_params = [&](int num_warps, int _num_parallel_tokens) {
+    G = H / Idx_t(group_size * num_warps);
+    grid = dim3(E * G, _num_parallel_tokens);
+    block = dim3(num_warps * WARP_SIZE);
+  };
+
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  VLLM_DISPATCH_FP8_TYPES(y_q.scalar_type(),
+                          "silu_mul_fp8_quant_deep_gemm_kernel",
+                          [&] { KERNEL_CALL_TOP_LEVEL });
+
+#endif
+}
--- a/csrc/quantization/w8a8/fp8/common.cuh
+++ b/csrc/quantization/w8a8/fp8/common.cuh
@ -5,7 +5,9 @@

 #include <cmath>

-#ifdef USE_ROCM
+#ifndef USE_ROCM
+  #include "nvidia/quant_utils.cuh"
+#else
  #include "amd/quant_utils.cuh"
 #endif

@ -48,7 +50,9 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
  float r =
      fmaxf(-quant_type_max_v<fp8_type>, fminf(x, quant_type_max_v<fp8_type>));
 #ifndef USE_ROCM
-  return static_cast<fp8_type>(r);
+  // Use hardware cvt instruction for fp8 on nvidia
+  // Currently only support fp8_type = c10::Float8_e4m3fn
+  return fp8::vec_conversion<fp8_type, float>(r);
 #else
  // Use hardware cvt instruction for fp8 on rocm
  return fp8::cvt_c10<fp8_type>(r);
--- a/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
+++ b/csrc/quantization/w8a8/fp8/nvidia/quant_utils.cuh
@ -12,13 +12,26 @@ namespace vllm {
 namespace fp8 {
  #ifdef ENABLE_FP8

-    #if 0  // Disable the following code to reduce the binary size.
 template <typename Tout, typename Tin>
-__inline__ __device__ Tout
-vec_conversion(const Tin &x, const __nv_fp8_interpretation_t fp8_type) {
+__inline__ __device__ Tout vec_conversion(
+    const Tin& x, const __nv_fp8_interpretation_t fp8_type = __NV_E4M3) {
  return x;
 }

+// float -> c10::Float8_e4m3fn
+template <>
+__inline__ __device__ c10::Float8_e4m3fn
+vec_conversion<c10::Float8_e4m3fn, float>(
+    const float& a, const __nv_fp8_interpretation_t fp8_type) {
+    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
+  return static_cast<c10::Float8_e4m3fn>(a);
+    #else
+  return c10::Float8_e4m3fn(__nv_cvt_float_to_fp8(a, __NV_SATFINITE, fp8_type),
+                            c10::Float8_e4m3fn::from_bits());
+    #endif
+}
+
+    #if 0  // Disable the following code to reduce the binary size.
 // fp8 -> half
 template <>
 __inline__ __device__ uint16_t vec_conversion<uint16_t, uint8_t>(
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -32,6 +32,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  #define stride_tag
 #endif

+  ops.def(
+      "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
+      "y_q, Tensor! y_s, int group_size, "
+      "bool use_ue8m0, int num_parallel_tokens) -> ()");
+  ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA,
+           &silu_mul_fp8_quant_deep_gemm_cuda);
+
  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);

@ -214,16 +221,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
  ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding);

-  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key
-  // (supports multiple loras).
-  ops.def(
-      "batched_rotary_embedding(Tensor positions, Tensor! query,"
-      "                         Tensor!? key, int head_size,"
-      "                         Tensor cos_sin_cache, bool is_neox,"
-      "                         int rot_dim,"
-      "                         Tensor cos_sin_cache_offsets) -> ()");
-  ops.impl("batched_rotary_embedding", torch::kCUDA, &batched_rotary_embedding);
-
  // Quantization ops
 #ifndef USE_ROCM
  // Quantized GEMM for AWQ.
--- a/docs/README.md
+++ b/docs/README.md
@ -56,7 +56,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support

--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@ -2,6 +2,6 @@

 vLLM's examples are split into three categories:

- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference)
- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving)
- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others)
+- If you are using vLLM from within Python code, see the *Offline Inference* section.
+- If you are using vLLM from an HTTP application or client, see the *Online Serving* section.
+- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see the *Others* section.
--- a/docs/features/README.md
+++ b/docs/features/README.md
@ -76,6 +76,3 @@ th:not(:first-child) {
 | multi-step                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | [❌](gh-issue:8477) | ✅     | ❌ |
 | best-of                                                   | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
 | beam-search                                               | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     | ❌ |
-
-!!! note
-    Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@ -45,6 +45,32 @@ When using multi-modal inputs, vLLM normally hashes each media item by content t
        print(o.outputs[0].text)
    ```

+Using UUIDs, you can also skip sending media data entirely if you expect cache hits for respective items. Note that the request will fail if the skipped media doesn't have a corresponding UUID, or if the UUID fails to hit the cache.
+
+??? code
+
+    ```python
+    from vllm import LLM
+    from PIL import Image
+
+    # Qwen2.5-VL example with two images
+    llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct")
+
+    prompt = "USER: <image><image>\nDescribe the differences.\nASSISTANT:"
+    img_b = Image.open("/path/to/b.jpg")
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": {"image": [None, img_b]},
+        # Since img_a is expected to be cached, we can skip sending the actual
+        # image entirely.
+        "multi_modal_uuids": {"image": ["sku-1234-a", None]},
+    })
+
+    for o in outputs:
+        print(o.outputs[0].text)
+    ```
+
 !!! warning
    If both multimodal processor caching and prefix caching are disabled, user-provided `multi_modal_uuids` are ignored.

@ -755,6 +781,39 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
    )
    ```

+For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this:
+
+    ```python
+        # Image/video/audio URL:
+        {
+            "type": "image_url",
+            "image_url": None,
+            "uuid": image_uuid,
+        },
+
+        # image_embeds
+        {
+            "type": "image_embeds",
+            "image_embeds": None,
+            "uuid": image_uuid
+        },
+
+        # input_audio:
+        {
+            "type": "input_audio",
+            "input_audio": None,
+            "uuid": audio_uuid
+        },
+
+        # PIL Image:
+        {
+            "type": "image_pil",
+            "image_pil": None
+            "uuid": image_uuid
+        }
+
+    ```
+
 !!! note
    Only one message can contain `{"type": "image_embeds"}`.
    If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
--- a/docs/features/quantization/README.md
+++ b/docs/features/quantization/README.md
@ -43,19 +43,19 @@ th:not(:first-child) {
 }
 </style>

-| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | Intel Gaudi | x86 CPU   | AWS Neuron   | Google TPU   |
-|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|--------------|
-| AWQ                   | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌          | ❌           |
-| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌          | ❌           |
-| Marlin (GPTQ/AWQ/FP8) | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| INT8 (W8A8)           | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ✅︎        | ✅︎          | ✅︎           |
-| FP8 (W8A8)            | ❌      | ❌       | ❌       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ✅︎          | ❌           |
-| BitBLAS               | ✅︎      | ✅       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| BitBLAS (GPTQ)        | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ❌          | ❌           |
-| INC (W8A8)            | ❌      | ❌       | ❌       | ❌    | ❌       | ❌         | ❌          | ✅︎         | ❌        | ❌          | ❌           |
+| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | Intel Gaudi | x86 CPU   | Google TPU   |
+|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------|
+| AWQ                   | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌           |
+| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ❌         | ✅︎        | ❌           |
+| Marlin (GPTQ/AWQ/FP8) | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| INT8 (W8A8)           | ❌      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ✅︎        | ✅︎           |
+| FP8 (W8A8)            | ❌      | ❌       | ❌       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ❌           |
+| BitBLAS               | ✅︎      | ✅       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| BitBLAS (GPTQ)        | ❌      | ❌       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌          | ❌         | ❌        | ❌           |
+| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎         | ❌          | ❌         | ❌        | ❌           |
+| INC (W8A8)            | ❌      | ❌       | ❌       | ❌    | ❌       | ❌         | ❌          | ✅︎         | ❌        | ❌           |

 - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
 - ✅︎ indicates that the quantization method is supported on the specified hardware.
--- a/docs/getting_started/installation/.nav.yml
+++ b/docs/getting_started/installation/.nav.yml
@ -3,5 +3,3 @@ nav:
  - gpu.md
  - cpu.md
  - google_tpu.md
-  - intel_gaudi.md
-  - aws_neuron.md
--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@ -12,7 +12,6 @@ vLLM supports the following hardware platforms:
    - [Apple silicon](cpu.md#apple-silicon)
    - [IBM Z (S390X)](cpu.md#ibm-z-s390x)
 - [Google TPU](google_tpu.md)
- [AWS Neuron](aws_neuron.md)

 ## Hardware Plugins

--- a/docs/getting_started/installation/aws_neuron.md
+++ b/docs/getting_started/installation/aws_neuron.md
@ -1,147 +0,0 @@
-# AWS Neuron
-
-[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and
-generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2,
-and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores.
-This describes how to set up your environment to run vLLM on Neuron.
-
-!!! warning
-    There are no pre-built wheels or images for this device, so you must build vLLM from source.
-
-## Requirements
-
- OS: Linux
- Python: 3.9 or newer
- Pytorch 2.5/2.6
- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips)
- AWS Neuron SDK 2.23
-
-## Configure a new environment
-
-### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies
-
-The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this
-[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image).
-
- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance
- Once inside your instance, activate the pre-installed virtual environment for inference by running
-
-```bash
-source /opt/aws_neuronx_venv_pytorch_2_6_nxd_inference/bin/activate
-```
-
-Refer to the [NxD Inference Setup Guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/nxdi-setup.html)
-for alternative setup instructions including using Docker and manually installing dependencies.
-
-!!! note
-    NxD Inference is the default recommended backend to run inference on Neuron. If you are looking to use the legacy [transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx)
-    library, refer to [Transformers NeuronX Setup](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/transformers-neuronx/setup/index.html).
-
-## Set up using Python
-
-### Pre-built wheels
-
-Currently, there are no pre-built Neuron wheels.
-
-### Build wheel from source
-
-To build and install vLLM from source, run:
-
-```bash
-git clone https://github.com/vllm-project/vllm.git
-cd vllm
-pip install -U -r requirements/neuron.txt
-VLLM_TARGET_DEVICE="neuron" pip install -e .
-```
-
-AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at
-<https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2>, which contains several features in addition to what's
-available on vLLM V0. Please utilize the AWS Fork for the following features:
-
- Llama-3.2 multi-modal support
- Multi-node distributed inference
-
-Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html)
-    for more details and usage examples.
-
-To install the AWS Neuron fork, run the following:
-
-```bash
-git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git
-cd upstreaming-to-vllm
-pip install -r requirements/neuron.txt
-VLLM_TARGET_DEVICE="neuron" pip install -e .
-```
-
-Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardwares is not tested.
-
-## Set up using Docker
-
-### Pre-built images
-
-Currently, there are no pre-built Neuron images.
-
-### Build image from source
-
-See [deployment-docker-build-image-from-source][deployment-docker-build-image-from-source] for instructions on building the Docker image.
-
-Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.
-
-## Extra information
-
-[](){ #feature-support-through-nxd-inference-backend }
-
-### Feature support through NxD Inference backend
-
-The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend
-to perform most of the heavy lifting which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most
-[features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration.
-
-To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override
-as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include
-
-```python
-override_neuron_config={
-    "enable_bucketing":False,
-}
-```
-
-or when launching vLLM from the CLI, pass
-
-```bash
--override-neuron-config "{\"enable_bucketing\":false}"
-```
-
-Alternatively, users can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts
-(via `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads.
-
-### Known limitations
-
- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this
-  [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility)
-  for how to convert pretrained EAGLE model checkpoints to be compatible for NxDI.
- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this
-  [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html)
-  to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM.
- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at
-  runtime is not currently supported. Refer to [multi-lora example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py)
- Multi-modal support: multi-modal support is only available through the AWS Neuron fork. This feature has not been upstreamed
-  to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature.
- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer
-  to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node)
-  to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main.
- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when sequence length approaches
-  max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt
-  to allocate an additional block to ensure there is enough memory for number of lookahead slots, but since we do not have good support
-  for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is
-  implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic.
-
-### Environment variables
-
- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid
-  compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the
-  artifacts under `neuron-compiled-artifacts/{unique_hash}/` subdirectory in the model path. If this environment variable is set,
-  but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts
-  under this specified path.
- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend).
- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation. (Only applicable to `transformers-neuronx` backend).
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -389,6 +389,7 @@ th {
 | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | ✅︎ |
 | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | ✅︎ |
 | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | | ✅︎ | ✅︎ |
 | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ |
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@ -42,7 +42,7 @@ def main():
        llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct"

    # Set `enforce_eager=True` to avoid ahead-of-time compilation.
-    # In real workloads, `enforace_eager` should be `False`.
+    # In real workloads, `enforce_eager` should be `False`.
    llm = LLM(**llm_args)
    outputs = llm.generate(prompts, sampling_params)
    print("-" * 50)
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -1764,6 +1764,7 @@ def apply_image_repeat(
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
+    inputs_with_empty_media = []
    cur_image = data
    for i in range(num_prompts):
        if image_repeat_prob is not None:
@ -1774,14 +1775,25 @@ def apply_image_repeat(
                new_val = (i // 256 // 256, i // 256, i % 256)
                cur_image.putpixel((0, 0), new_val)

+        uuid = "uuid_{}".format(i)
+
        inputs.append(
            {
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {modality: cur_image},
+                "multi_modal_uuids": {modality: uuid},
            }
        )

-    return inputs
+        inputs_with_empty_media.append(
+            {
+                "prompt": prompts[i % len(prompts)],
+                "multi_modal_data": {modality: None},
+                "multi_modal_uuids": {modality: uuid},
+            }
+        )
+
+    return inputs, inputs_with_empty_media


@contextmanager
@ -1860,6 +1872,13 @@ def parse_args():
        help="If True, then use different prompt (with the same multi-modal "
        "data) for each request.",
    )
+
+    parser.add_argument(
+        "--verify-mm-cache-hit-with-uuids",
+        action="store_true",
+        help="If True, will send all requests in a second batch with empty mm "
+        "data to verify cache hits with UUIDs.",
+    )
    return parser.parse_args()


@ -1903,26 +1922,48 @@ def main(args):
    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
+        uuid = "uuid_0"
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {modality: data},
+            "multi_modal_uuids": {modality: uuid},
+        }
+        inputs_with_empty_media = {
+            "prompt": prompts[0],
+            "multi_modal_data": {modality: None},
+            "multi_modal_uuids": {modality: uuid},
        }
    else:
        # Batch inference
        if args.image_repeat_prob is not None:
            # Repeat images with specified probability of "image_repeat_prob"
-            inputs = apply_image_repeat(
-                args.image_repeat_prob, args.num_prompts, data, prompts, modality
+            inputs, inputs_with_empty_media = apply_image_repeat(
+                args.image_repeat_prob,
+                args.num_prompts,
+                data,
+                prompts,
+                modality,
            )
        else:
            # Use the same image for all prompts
-            inputs = [
-                {
-                    "prompt": prompts[i % len(prompts)],
-                    "multi_modal_data": {modality: data},
-                }
-                for i in range(args.num_prompts)
-            ]
+            inputs = []
+            inputs_with_empty_media = []
+            for i in range(args.num_prompts):
+                uuid = "uuid_{}".format(i)
+                inputs.append(
+                    {
+                        "prompt": prompts[i % len(prompts)],
+                        "multi_modal_data": {modality: data},
+                        "multi_modal_uuids": {modality: uuid},
+                    }
+                )
+                inputs_with_empty_media.append(
+                    {
+                        "prompt": prompts[i % len(prompts)],
+                        "multi_modal_data": {modality: None},
+                        "multi_modal_uuids": {modality: uuid},
+                    }
+                )

    # Add LoRA request if applicable
    lora_request = (
@ -1942,6 +1983,26 @@ def main(args):
        print(generated_text)
        print("-" * 50)

+    if args.verify_mm_cache_hit_with_uuids:
+        try:
+            # Verify cache hits with UUIDs
+            print(
+                "Sending a second batch of requests with empty media"
+                " and matching UUIDs."
+            )
+            outputs = llm.generate(
+                inputs_with_empty_media,
+                sampling_params=sampling_params,
+                lora_request=lora_request,
+            )
+            print("-" * 50)
+            for o in outputs:
+                generated_text = o.outputs[0].text
+                print(generated_text)
+                print("-" * 50)
+        except Exception as e:
+            print(f"Failed to verify cache hits with UUIDs. Error: {e}")
+

 if __name__ == "__main__":
    args = parse_args()
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@ -62,6 +62,8 @@ def _fix_prompt_embed_outputs(
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False])
+@pytest.mark.parametrize("async_scheduling", [True, False])
+@pytest.mark.parametrize("model_executor", ["uni", "mp"])
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models(
    monkeypatch: pytest.MonkeyPatch,
@ -70,6 +72,8 @@ def test_models(
    backend: str,
    max_tokens: int,
    enforce_eager: bool,
+    async_scheduling: bool,
+    model_executor: str,
    enable_prompt_embeds: bool,
 ) -> None:

@ -77,6 +81,12 @@ def test_models(
            "VLLM_USE_V1") and envs.VLLM_USE_V1:
        pytest.skip("enable_prompt_embeds is not supported in v1.")

+    if not envs.VLLM_USE_V1:
+        if async_scheduling:
+            pytest.skip("async_scheduling only supported in v1.")
+        if model_executor != "uni":
+            pytest.skip("only test uniproc executor for v0.")
+
    if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
        pytest.skip(
            f"{backend} does not support gemma2 with full context length.")
@ -98,11 +108,15 @@ def test_models(
                    prompt_embeds = hf_model.get_prompt_embeddings(
                        example_prompts)

-        with VllmRunner(model,
-                        max_model_len=8192,
-                        enforce_eager=enforce_eager,
-                        enable_prompt_embeds=enable_prompt_embeds,
-                        gpu_memory_utilization=0.7) as vllm_model:
+        with VllmRunner(
+                model,
+                max_model_len=8192,
+                enforce_eager=enforce_eager,
+                enable_prompt_embeds=enable_prompt_embeds,
+                gpu_memory_utilization=0.7,
+                async_scheduling=async_scheduling,
+                distributed_executor_backend=model_executor,
+        ) as vllm_model:
            if enable_prompt_embeds:
                vllm_outputs = vllm_model.generate_greedy(
                    prompt_embeds, max_tokens)
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@ -522,6 +522,71 @@ async def test_completions_with_image_with_uuid(
        assert isinstance(chat_completion.choices[0].message.content, str)
        assert len(chat_completion.choices[0].message.content) > 0

+        # Second request, with empty image but the same uuid.
+        chat_completion_with_empty_image = await client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role":
+                    "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Describe this image.",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {},
+                            "uuid": image_url
+                        },
+                    ],
+                },
+            ],
+            model=model_name,
+        )
+        assert chat_completion_with_empty_image.choices[
+            0].message.content is not None
+        assert isinstance(
+            chat_completion_with_empty_image.choices[0].message.content, str)
+        assert len(
+            chat_completion_with_empty_image.choices[0].message.content) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_completions_with_empty_image_with_uuid_without_cache_hit(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+):
+    with pytest.raises(openai.BadRequestError):
+        _ = await client.chat.completions.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant."
+                },
+                {
+                    "role":
+                    "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Describe this image.",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {},
+                            "uuid": "uuid_not_previously_seen"
+                        },
+                    ],
+                },
+            ],
+            model=model_name,
+        )
+

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@ -79,6 +79,28 @@ def phi3v_tokenizer():
    )


+@pytest.fixture(scope="function")
+def qwen2_audio_model_config():
+    return ModelConfig(
+        QWEN2AUDIO_MODEL_ID,
+        runner="generate",
+        trust_remote_code=True,
+        limit_mm_per_prompt={
+            "audio": 1,
+        },
+    )
+
+
+@pytest.fixture(scope="module")
+def qwen2_audio_tokenizer():
+    return TokenizerGroup(
+        tokenizer_id=QWEN2AUDIO_MODEL_ID,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+
+
@pytest.fixture(scope="function")
 def qwen25omni_model_config_mm_interleaved():
    return ModelConfig(
@ -169,6 +191,7 @@ def audio_url():
 def _assert_mm_data_is_image_input(
    mm_data: Optional[MultiModalDataDict],
    image_count: int,
+    skipped_image_indices: Optional[list] = None,
 ) -> None:
    assert mm_data is not None
    assert set(mm_data.keys()) == {"image"}
@ -177,6 +200,9 @@ def _assert_mm_data_is_image_input(
    assert image_data is not None

    assert isinstance(image_data, list) and len(image_data) == image_count
+    if skipped_image_indices is not None:
+        for i in skipped_image_indices:
+            assert image_data[i] is None


 def _assert_mm_uuids(
@ -205,8 +231,10 @@ MultiModalDataCounts = Mapping[ModalityType, int]


 def _assert_mm_data_inputs(
-    mm_data: Optional[MultiModalDataDict],
-    data_count: MultiModalDataCounts,
+        mm_data: Optional[MultiModalDataDict],
+        data_count: MultiModalDataCounts,
+        skipped_media_indices: Optional[dict[
+            str, list]] = None,  # modality -> list[int]
 ) -> None:
    assert mm_data is not None
    assert set(data_count.keys()) == (set(mm_data.keys()))
@ -216,6 +244,13 @@ def _assert_mm_data_inputs(
        assert modality_data is not None
        assert isinstance(modality_data, list) and len(modality_data) == n

+        if skipped_media_indices is not None:
+            skipped_media_indices_for_modality = skipped_media_indices.get(
+                modality)
+            assert skipped_media_indices_for_modality is not None
+            for i in skipped_media_indices_for_modality:
+                assert modality_data[i] is None
+

 def test_parse_chat_messages_single_image(
    phi3v_model_config,
@ -289,6 +324,41 @@ def test_parse_chat_messages_single_image_with_uuid(
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


+def test_parse_chat_messages_single_empty_image_with_uuid(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid = str(hash(image_url))
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
+
+
 def test_parse_chat_messages_single_image_with_bad_uuid_format(
    phi3v_model_config,
    phi3v_tokenizer,
@ -375,6 +445,96 @@ def test_parse_chat_messages_multiple_images_with_uuids(
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


+def test_parse_chat_messages_multiple_empty_images_with_uuids(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid1 = "my_uuid_1"
+    image_uuid2 = "my_uuid_2"
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid1,
+                },
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid2,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in the image?",
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1])
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
+
+
+def test_parse_chat_messages_mixed_empty_images_with_uuids(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid1 = "my_uuid_1"
+    image_uuid2 = "my_uuid_2"
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                    },
+                    "uuid": image_uuid1,
+                },
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid2,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in the image?",
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1])
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
+
+
@pytest.mark.asyncio
 async def test_parse_chat_messages_single_image_with_uuid_async(
    phi3v_model_config,
@ -413,6 +573,44 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


+@pytest.mark.asyncio
+async def test_parse_chat_messages_empty_image_with_uuid_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid = str(hash(image_url))
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    _assert_mm_data_is_image_input(await mm_future,
+                                   1,
+                                   skipped_image_indices=[0])
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
+
+
@pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_with_uuids_async(
    phi3v_model_config,
@ -460,6 +658,53 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid1 = "my_uuid_1"
+    image_uuid2 = "my_uuid_2"
+
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid1,
+                },
+                {
+                    "type": "image_pil",
+                    "image_pil": None,
+                    "uuid": image_uuid2,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in these images?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+    }]
+    _assert_mm_data_is_image_input(await mm_future,
+                                   2,
+                                   skipped_image_indices=[0, 1])
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
+
+
@pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
    phi3v_model_config,
@ -653,6 +898,114 @@ def test_parse_chat_messages_multiple_images(
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


+def test_parse_chat_messages_empty_pil_image_with_uuid(
+    phi3v_model_config,
+    phi3v_tokenizer,
+):
+    uuid = "abcd"
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_pil",
+                    "image_pil": None,
+                    "uuid": uuid
+                },
+                {
+                    "type": "text",
+                    "text": "What's in this image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in this image?",
+    }]
+    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
+
+
+def test_parse_chat_messages_empty_image_embeds_with_uuid(
+    phi3v_model_config,
+    phi3v_tokenizer,
+):
+    uuid = "abcd"
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_embeds",
+                    "image_embeds": None,
+                    "uuid": uuid
+                },
+                {
+                    "type": "text",
+                    "text": "What's in this image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in this image?",
+    }]
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert mm_data["image"] is None
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+):
+    uuid = "abcd"
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_embeds",
+                    "image_embeds": None,
+                    "uuid": uuid
+                },
+                {
+                    "type": "text",
+                    "text": "What's in this image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in this image?",
+    }]
+    mm_data = await mm_future
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert mm_data["image"] is None
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
+
+
@pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_async(
    phi3v_model_config,
@ -1636,6 +1989,118 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
                     expected_uuids=["audio_123"])


+def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave(  # noqa: E501
+    qwen25omni_model_config_mm_interleaved,
+    qwen25omni_tokenizer,
+    image_url,
+    video_url,
+    audio_url,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": "image_123",
+                    },
+                    {
+                        "type": "text",
+                        "text": "Now listen to this audio"
+                    },
+                    {
+                        "type": "audio_url",
+                        "audio_url": None,
+                        "uuid": "audio_123",
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": "image_123",
+                    },
+                    {
+                        "type": "text",
+                        "text": "And what's in the video?"
+                    },
+                    {
+                        "type": "video_url",
+                        "video_url": None,
+                        "uuid": "video_123",
+                    },
+                ],
+            },
+        ],
+        qwen25omni_model_config_mm_interleaved,
+        qwen25omni_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [
+        {
+            "role":
+            "user",
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
+        },
+        {
+            "role": "assistant",
+            "content": "Some stuff."
+        },
+        {
+            "role":
+            "user",
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+        },
+    ]
+
+    _assert_mm_data_inputs(mm_data, {
+        "image": 2,
+        "video": 1,
+        "audio": 1
+    },
+                           skipped_media_indices={
+                               "image": [0, 1],
+                               "video": [0],
+                               "audio": [0]
+                           })
+    _assert_mm_uuids(mm_uuids,
+                     2,
+                     modality="image",
+                     expected_uuids=["image_123", "image_123"])
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="video",
+                     expected_uuids=["video_123"])
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="audio",
+                     expected_uuids=["audio_123"])
+
+
 def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
@ -2355,3 +2820,82 @@ def test_apply_mistral_chat_template_thinking_chunk():
        r"[INST]Thanks, what is 3+3?[/INST]")

    assert string_tokens == expected_tokens
+
+
+def test_parse_chat_messages_single_empty_audio_with_uuid(
+    qwen2_audio_model_config,
+    qwen2_audio_tokenizer,
+):
+    audio_uuid = "abcd"
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "input_audio",
+                    "input_audio": {},
+                    "uuid": audio_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What does the audio say?"
+                },
+            ],
+        }],
+        qwen2_audio_model_config,
+        qwen2_audio_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
+    }]
+    _assert_mm_data_inputs(mm_data, {"audio": 1})
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="audio",
+                     expected_uuids=[audio_uuid])
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
+    qwen2_audio_model_config,
+    qwen2_audio_tokenizer,
+):
+    audio_uuid = "abcd"
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "input_audio",
+                    "input_audio": {},
+                    "uuid": audio_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What does the audio say?"
+                },
+            ],
+        }],
+        qwen2_audio_model_config,
+        qwen2_audio_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
+    }]
+    _assert_mm_data_inputs(await mm_future, {"audio": 1})
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="audio",
+                     expected_uuids=[audio_uuid])
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@ -22,7 +22,10 @@ def clear_cache():

 # Define MLA and non-MLA backends separately
 DEVICE_MLA_BACKENDS = {
-    "cuda": ["TRITON_MLA", "FLASHMLA", "FLASH_ATTN_MLA", "CUTLASS_MLA"],
+    "cuda": [
+        "TRITON_MLA", "FLASHMLA", "FLASHINFER_MLA", "FLASH_ATTN_MLA",
+        "CUTLASS_MLA"
+    ],
    "hip": ["TRITON_MLA", "ROCM_AITER_MLA"],
    "cpu": [],
 }
@ -90,8 +93,8 @@ def test_env(

            with patch("vllm.attention.selector.current_platform",
                       CpuPlatform()):
-                backend = get_attn_backend(16, torch.float16, torch.float16,
-                                           block_size, False)
+                backend = get_attn_backend(16, torch.float16, None, block_size,
+                                           False)
            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"

        elif device == "hip":
@ -109,7 +112,7 @@ def test_env(
                        with pytest.raises(ValueError) as exc_info:
                            get_attn_backend(16,
                                             torch.float16,
-                                             torch.float16,
+                                             None,
                                             block_size,
                                             False,
                                             use_mla=use_mla)
@ -120,7 +123,7 @@ def test_env(
                        with pytest.raises(ValueError) as exc_info:
                            get_attn_backend(16,
                                             torch.float16,
-                                             torch.float16,
+                                             None,
                                             block_size,
                                             False,
                                             use_mla=use_mla)
@ -130,7 +133,7 @@ def test_env(
                        # Valid backend-block_size combination
                        backend = get_attn_backend(16,
                                                   torch.float16,
-                                                   torch.float16,
+                                                   None,
                                                   block_size,
                                                   False,
                                                   use_mla=use_mla)
@ -139,7 +142,7 @@ def test_env(
                else:
                    backend = get_attn_backend(16,
                                               torch.float16,
-                                               torch.float16,
+                                               None,
                                               block_size,
                                               False,
                                               use_mla=use_mla)
@ -153,6 +156,8 @@ def test_env(
                    # CUDA MLA backend logic:
                    # - CUTLASS_MLA: only supported with block_size == 128
                    #   and Blackwell GPUs (SM 10.0), V1 only
+                    # - FLASHINFER_MLA: only supported on Blackwell GPUs
+                    #   (SM 10.0+), V1 only
                    # - FLASHMLA: only supported with block_size == 64
                    # - FLASH_ATTN_MLA: V1 only
                    # - TRITON_MLA: fallback for other cases
@ -169,12 +174,31 @@ def test_env(
                        else:
                            backend = get_attn_backend(16,
                                                       torch.float16,
-                                                       torch.float16,
+                                                       None,
                                                       block_size,
                                                       False,
                                                       use_mla=use_mla)
                            expected = "CUTLASS_MLA_VLLM_V1"
                            assert backend.get_name() == expected
+                    elif name == "FLASHINFER_MLA":
+                        if not use_v1:
+                            # FlashInfer MLA only supported on V1 engine
+                            pytest.skip(
+                                "FlashInfer MLA only supported on V1 engine")
+                        elif block_size not in [32, 64]:
+                            # FlashInfer MLA only supports block_size 32 or 64
+                            pytest.skip(
+                                "FlashInfer MLA only supports block_size 32 "
+                                "or 64")
+                        else:
+                            backend = get_attn_backend(16,
+                                                       torch.float16,
+                                                       None,
+                                                       block_size,
+                                                       False,
+                                                       use_mla=use_mla)
+                            expected = "FLASHINFER_MLA"
+                            assert backend.get_name() == expected
                    elif name == "FLASHMLA":
                        if block_size != 64:
                            # FlashMLA only supports block_size == 64
@ -189,7 +213,7 @@ def test_env(
                            else:
                                backend = get_attn_backend(16,
                                                           torch.float16,
-                                                           torch.float16,
+                                                           None,
                                                           block_size,
                                                           False,
                                                           use_mla=use_mla)
@ -204,7 +228,7 @@ def test_env(
                        else:
                            backend = get_attn_backend(16,
                                                       torch.float16,
-                                                       torch.float16,
+                                                       None,
                                                       block_size,
                                                       False,
                                                       use_mla=use_mla)
@ -214,7 +238,7 @@ def test_env(
                        # TRITON_MLA or other fallback
                        backend = get_attn_backend(16,
                                                   torch.float16,
-                                                   torch.float16,
+                                                   None,
                                                   block_size,
                                                   False,
                                                   use_mla=use_mla)
@ -224,7 +248,7 @@ def test_env(
                elif name == "FLASHINFER":
                    backend = get_attn_backend(16,
                                               torch.float16,
-                                               torch.float16,
+                                               None,
                                               block_size,
                                               False,
                                               use_mla=use_mla)
@ -233,7 +257,7 @@ def test_env(
                else:
                    backend = get_attn_backend(32,
                                               torch.float16,
-                                               torch.float16,
+                                               None,
                                               block_size,
                                               False,
                                               use_mla=use_mla)
@ -243,7 +267,7 @@ def test_env(
                    if use_v1:
                        backend = get_attn_backend(16,
                                                   torch.float16,
-                                                   torch.float16,
+                                                   None,
                                                   block_size,
                                                   False,
                                                   use_mla=use_mla)
@ -269,15 +293,13 @@ def test_fp32_fallback(

            with patch("vllm.attention.selector.current_platform",
                       CpuPlatform()):
-                backend = get_attn_backend(16, torch.float32, torch.float32,
-                                           16, False)
+                backend = get_attn_backend(16, torch.float32, None, 16, False)
            assert backend.get_name() == "TORCH_SDPA_VLLM_V1"

        elif device == "cuda":
            with patch("vllm.attention.selector.current_platform",
                       CudaPlatform()):
-                backend = get_attn_backend(16, torch.float32, torch.float32,
-                                           16, False)
+                backend = get_attn_backend(16, torch.float32, None, 16, False)
            assert (backend.get_name() == "FLEX_ATTENTION"
                    if use_v1 else "XFORMERS")

@ -331,7 +353,7 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
        assert backend.get_name() != STR_FLASH_ATTN_VAL

        # Attention-free models should bypass env and use PlaceholderAttention
-        backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
+        backend = get_attn_backend(16, torch.float16, None, 16, True)
        assert backend.get_name() != STR_FLASH_ATTN_VAL


--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from itertools import accumulate, product
+from itertools import product
 from typing import Callable, Optional

 import pytest
@ -111,151 +111,6 @@ def test_rotary_embedding(
            "expected returned key to be None"


-@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
-@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN)
-@pytest.mark.parametrize("batch_size", BATCH_SIZES)
-@pytest.mark.parametrize("seq_len", SEQ_LENS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_key", USE_KEY)
-@torch.inference_mode()
-def test_batched_rotary_embedding(
-    is_neox_style: bool,
-    tensor_shape_fn: Callable[[int, int, int, int], tuple[int]],
-    batch_size: int,
-    seq_len: int,
-    num_heads: int,
-    head_size: int,
-    rotary_dim: Optional[int],
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    use_key: bool,
-    max_position: int = 8192,
-    base: float = 10000,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
-        "rope_type": "linear",
-        "factor": (1, )
-    })
-    rope = rope.to(dtype=dtype, device=torch.get_default_device())
-
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size)
-    query = torch.randn(query_shape, dtype=dtype)
-    key = torch.randn_like(query) if use_key else None
-
-    # slice tensor if required, noop otherwise
-    query = query[..., :head_size]
-    key = key[..., :head_size] if use_key else None
-
-    # NOTE(woosuk): The reference implementation should be executed first
-    # because the custom kernel is in-place.
-    ref_query, ref_key = rope.forward_native(positions, query, key)
-    out_query, out_key = rope.forward(positions,
-                                      query,
-                                      key,
-                                      offsets=torch.zeros(batch_size * seq_len,
-                                                          dtype=torch.long,
-                                                          device=device))
-    # Compare the results.
-    torch.testing.assert_close(out_query,
-                               ref_query,
-                               atol=get_default_atol(out_query),
-                               rtol=get_default_rtol(out_query))
-    if use_key:
-        torch.testing.assert_close(out_key,
-                                   ref_key,
-                                   atol=get_default_atol(out_key),
-                                   rtol=get_default_rtol(out_key))
-    else:
-        assert ref_key is None and out_key is None, \
-            "expected returned key to be None"
-
-
-@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
-@pytest.mark.parametrize("batch_size", BATCH_SIZES)
-@pytest.mark.parametrize("seq_len", SEQ_LENS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_key", USE_KEY)
-@torch.inference_mode()
-def test_batched_rotary_embedding_multi_lora(
-    is_neox_style: bool,
-    batch_size: int,
-    seq_len: int,
-    num_heads: int,
-    head_size: int,
-    rotary_dim: Optional[int],
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    use_key: bool,
-    max_position: int = 8192,
-    base: float = 10000,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    scaling_factors: list[int] = [1, 2, 4]
-    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, {
-        "rope_type": "linear",
-        "factor": tuple(scaling_factors)
-    })
-    rope = rope.to(dtype=dtype, device=torch.get_default_device())
-
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=dtype)
-    key = torch.randn_like(query) if use_key else None
-
-    offset_map = torch.tensor(
-        list(
-            accumulate([0] + [
-                max_position * scaling_factor * 2
-                for scaling_factor in scaling_factors[:-1]
-            ])))
-    query_types = torch.randint(0,
-                                len(scaling_factors), (batch_size, seq_len),
-                                device=device)
-    query_offsets = offset_map[query_types]
-
-    # NOTE(woosuk): The reference implementation should be executed first
-    # because the custom kernel is in-place.
-    ref_query, ref_key = rope.forward_native(positions, query, key,
-                                             query_offsets)
-    out_query, out_key = rope.forward(positions, query, key,
-                                      query_offsets.flatten())
-    # Compare the results.
-    torch.testing.assert_close(out_query,
-                               ref_query,
-                               atol=get_default_atol(out_query),
-                               rtol=get_default_rtol(out_query))
-    if use_key:
-        torch.testing.assert_close(out_key,
-                                   ref_key,
-                                   atol=get_default_atol(out_key),
-                                   rtol=get_default_rtol(out_key))
-    else:
-        assert ref_key is None and out_key is None, \
-            "expected returned key to be None"
-
-
@torch.inference_mode()
 def test_rope_module_cache():
    MAX_POSITIONS = [123, 1234]
--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@ -16,20 +16,14 @@ from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 def rotary_embedding_opcheck(rot,
                             positions: torch.Tensor,
                             query: torch.Tensor,
-                             key: Optional[torch.Tensor] = None,
-                             offsets: Optional[torch.Tensor] = None):
+                             key: Optional[torch.Tensor] = None):
    cos_sin_cache = rot.cos_sin_cache.to(query.device, dtype=query.dtype)

-    # ops.rotary_embedding()/batched_rotary_embedding()
-    # are in-place operations that update the query and key tensors.
-    if offsets is not None:
-        opcheck(torch.ops._C.batched_rotary_embedding,
-                (positions, query, key, rot.head_size, cos_sin_cache,
-                 rot.is_neox_style, rot.rotary_dim, offsets))
-    else:
-        opcheck(torch.ops._C.rotary_embedding,
-                (positions, query, key, rot.head_size, cos_sin_cache,
-                 rot.is_neox_style))
+    # ops.rotary_embedding() is a in-place operation
+    # that updates the query and key tensors.
+    opcheck(torch.ops._C.rotary_embedding,
+            (positions, query, key, rot.head_size, cos_sin_cache,
+             rot.is_neox_style))


@pytest.mark.parametrize("device", ["cuda"])
@ -65,10 +59,6 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
    key = key[..., :head_size] if use_key else None

    rotary_embedding_opcheck(rot, positions, query, key)
-    offsets = torch.zeros(batch_size * seq_len,
-                          device=device,
-                          dtype=torch.long)
-    rotary_embedding_opcheck(rot, positions, query, key, offsets)

    # if we have a contiguous head stride, test the alternate
    # [..., num_heads * head_dim] shape/layout
--- a/tests/kernels/moe/test_mxfp4_moe.py
+++ b/tests/kernels/moe/test_mxfp4_moe.py
@ -771,11 +771,11 @@ def test_flashinfer_cutlass_mxfp4_mxfp8_fused_moe(
    w13_ref = dequant_mxfp4_batches(
        w13_q.view(torch.uint8),
        w13_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
-            num_experts, 2 * intermediate_size, hidden_size)
+            num_experts, 2 * intermediate_size, hidden_size).to(device)
    w2_ref = dequant_mxfp4_batches(
        w2_q.view(torch.uint8),
        w2_scale.view(torch.uint8).reshape(-1)).to(torch.float32).reshape(
-            num_experts, hidden_size, intermediate_size)
+            num_experts, hidden_size, intermediate_size).to(device)

    # Quantize activations for SM100 path and dequantize for reference
    hidden_states_q, hidden_states_sf = mxfp8_quantize(hidden_states, True, 32)
--- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
@ -5,28 +5,52 @@ import pytest
 import torch

 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm)
+    silu_mul_fp8_quant_deep_gemm_cuda)
 from vllm.platforms import current_platform
+from vllm.utils import cdiv
+
+fp8_dtype = torch.float8_e4m3fn

-# (E, T, H, group_size, seed)
 CASES = [
-    (1, 1, 128, 64, 0),
-    (1, 4, 128, 128, 0),
-    (2, 4, 256, 128, 0),
-    (32, 64, 256, 128, 0),
-    (17, 31, 768, 128, 0),
+    (1, 1, 128, fp8_dtype),
+    (1, 4, 128, fp8_dtype),
+    (2, 4, 256, fp8_dtype),
+    (32, 64, 256, fp8_dtype),
+    (17, 31, 768, fp8_dtype),
+    (1, 1, 128 * 1, fp8_dtype),
+    (1, 1, 128 * 2, fp8_dtype),
+    (1, 1, 128 * 3, fp8_dtype),
+    (1, 1, 128 * 4, fp8_dtype),
+    (8, 16, 128 * 1, fp8_dtype),
+    (8, 16, 128 * 2, fp8_dtype),
+    (8, 16, 128 * 3, fp8_dtype),
+    (8, 16, 128 * 4, fp8_dtype),
+    (8, 64, 7168, fp8_dtype),
+    (8, 128, 7168, fp8_dtype),
+    (8, 256, 7168, fp8_dtype),
+    (8, 512, 7168, fp8_dtype),
+    (8, 1024, 7168, fp8_dtype),
+    (256, 8, 7168, fp8_dtype),
+    (256, 16, 7168, fp8_dtype),
+    (256, 32, 7168, fp8_dtype),
+    (256, 64, 7168, fp8_dtype),
+
+    # Only add a few fnuz tests to help with long CI times.
+    (8, 512, 7168, torch.float8_e4m3fnuz),
+    (8, 1024, 7168, torch.float8_e4m3fnuz),
 ]


-@pytest.mark.parametrize("E,T,H,group_size,seed", CASES)
+@pytest.mark.parametrize("E,T,H,fp8_type", CASES)
@torch.inference_mode()
-def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
-    current_platform.seed_everything(seed)
+def test_silu_mul_fp8_quant_deep_gemm(E, T, H, fp8_type):
+    group_size = 128
+    current_platform.seed_everything(42)

    # Input tensor of shape (E, T, 2*H)
    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
    tokens_per_expert = torch.randint(
-        low=0,
+        low=T // 2,
        high=T,
        size=(E, ),
        dtype=torch.int32,
@ -34,45 +58,59 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
    )

    # Run the Triton kernel
-    y_q, y_s = silu_mul_fp8_quant_deep_gemm(y,
-                                            tokens_per_expert,
-                                            group_size=group_size,
-                                            eps=1e-10)
+    y_q, y_s = silu_mul_fp8_quant_deep_gemm_cuda(y,
+                                                 tokens_per_expert,
+                                                 group_size=group_size)

-    # Reference implementation
-    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    torch.cuda.synchronize()
+    fp8_info = torch.finfo(fp8_dtype)
    fp8_max = fp8_info.max
    fp8_min = fp8_info.min
    eps = 1e-10

-    # Compute silu activation and elementwise multiplication
-    y1 = y[..., :H]
+    y1 = y[..., :H].float()
    y2 = y[..., H:]
    silu_x = y1 * torch.sigmoid(y1)
    merged = silu_x * y2

-    # Compute reference scales and quantized output, skipping padded tokens
    for e in range(E):
        nt = tokens_per_expert[e].item()
-        ref_s = torch.empty((T, H // group_size),
+        ref_s = torch.empty((T, cdiv(H, group_size)),
                            dtype=torch.float32,
                            device="cuda")
-        ref_q = torch.empty((T, H), dtype=torch.float8_e4m3fn, device="cuda")
+        ref_q = torch.empty((T, H), dtype=fp8_dtype, device="cuda")
+
        for t in range(nt):
-            data = merged[e, t]
-            data_grp = data.view(H // group_size, group_size)
-            amax = data_grp.abs().amax(dim=1).clamp(min=eps)
-            scale = amax / fp8_max
+            data = merged[e, t].float()
+            ref_q_row = torch.empty_like(data)

-            scaled = data / scale.repeat_interleave(group_size)
-            clamped = scaled.clamp(fp8_min, fp8_max)
-            q = clamped.to(torch.float8_e4m3fn)
+            # process full groups
+            n_full_groups = H // group_size
+            if n_full_groups > 0:
+                data_grp = data[:n_full_groups * group_size].view(
+                    n_full_groups, group_size)
+                amax = data_grp.abs().amax(dim=1).clamp(min=eps)
+                scale = amax / fp8_max
+                scaled = data[:n_full_groups *
+                              group_size] / scale.repeat_interleave(group_size)
+                ref_q_row[:n_full_groups * group_size] = scaled.clamp(
+                    fp8_min, fp8_max).to(fp8_dtype)
+                ref_s[t, :n_full_groups] = scale

-            ref_s[t] = scale
-            ref_q[t] = q
+            # process remainder group
+            rem = H % group_size
+            if rem > 0:
+                data_rem = data[-rem:]
+                amax = data_rem.abs().amax().clamp(min=eps)
+                scale = amax / fp8_max
+                scaled = data_rem / scale
+                ref_q_row[-rem:] = scaled.clamp(fp8_min, fp8_max).to(fp8_dtype)
+                ref_s[t, -1] = scale

-        y_se = y_s[e]
-        y_qe = y_q[e]
+            ref_q[t] = ref_q_row
+
+        y_se = y_s[e].float()
+        y_qe = y_q[e].float()

        torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2)
        torch.testing.assert_close(
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -301,6 +301,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                            trust_remote_code=True),
    "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"),
    "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"),
+    "Olmo3ForCausalLM": _HfExamplesInfo("shanearora/2025-sep-a-base-model"),
    "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"),
    "OPTForCausalLM": _HfExamplesInfo("facebook/opt-125m",
                                      {"1b": "facebook/opt-iml-max-1.3b"}),
--- a/tests/v1/attention/test_attention_backends.py
+++ b/tests/v1/attention/test_attention_backends.py
@ -178,6 +178,7 @@ class MockAttentionLayer:
        self._k_scale = torch.tensor(1.0, device=device)
        self._v_scale = torch.tensor(1.0, device=device)
        # Add float versions for flashinfer
+        self._q_scale_float = 1.0
        self._k_scale_float = 1.0
        self._v_scale_float = 1.0

--- a/tests/v1/attention/utils.py
+++ b/tests/v1/attention/utils.py
@ -141,6 +141,8 @@ def get_attention_backend(backend_name: _Backend):
        "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
        _Backend.FLASH_ATTN_MLA:
        "vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
+        _Backend.FLASHINFER_MLA:
+        "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
        _Backend.TRITON_MLA_VLLM_V1:
        "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
    }
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@ -117,9 +117,9 @@ def test_ngram_correctness(
                print(f"ref_output: {ref_output.outputs[0].text}")
                print(f"spec_output: {spec_output.outputs[0].text}")

-        # Heuristic: expect at least 68% of the prompts to match exactly
+        # Heuristic: expect at least 66% of the prompts to match exactly
        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches >= int(0.68 * len(ref_outputs))
+        assert matches >= int(0.66 * len(ref_outputs))
        del spec_llm
        torch.cuda.empty_cache()
        cleanup_dist_env_and_memory()
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@ -257,9 +257,13 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
        def execute_model(
            self,
            scheduler_output,
+            non_block=False,
        ) -> Future[ModelRunnerOutput]:
            """Make execute_model non-blocking."""

+            # DummyExecutor used only for testing async case.
+            assert non_block
+
            def _execute():
                output = self.collective_rpc("execute_model",
                                             args=(scheduler_output, ))
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
@ -33,10 +33,12 @@ def test_ragged_paged_attention():
    )

    class FakeAttentionLayer:
+        _q_scale_float: float
        _k_scale_float: float
        _v_scale_float: float

    layer = FakeAttentionLayer()
+    layer._q_scale_float = 1.0
    layer._k_scale_float = 1.0
    layer._v_scale_float = 1.0

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@ -257,16 +257,6 @@ def rotary_embedding(
                                  cos_sin_cache, is_neox)


-def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
-                             key: Optional[torch.Tensor], head_size: int,
-                             cos_sin_cache: torch.Tensor, is_neox: bool,
-                             rot_dim: int,
-                             cos_sin_cache_offsets: torch.Tensor) -> None:
-    torch.ops._C.batched_rotary_embedding(positions, query, key, head_size,
-                                          cos_sin_cache, is_neox, rot_dim,
-                                          cos_sin_cache_offsets)
-
-
 # layer norm ops
 def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
             epsilon: float) -> None:
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@ -148,17 +148,6 @@ class ipex_ops:
                                                     head_size, cos_sin_cache,
                                                     is_neox, rot_dim)

-    @staticmethod
-    def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
-                                 key: torch.Tensor, head_size: int,
-                                 cos_sin_cache: torch.Tensor, is_neox: bool,
-                                 rot_dim: int,
-                                 cos_sin_cache_offsets: torch.Tensor) -> None:
-        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
-                                                     head_size, cos_sin_cache,
-                                                     is_neox, rot_dim,
-                                                     cos_sin_cache_offsets)
-
    @staticmethod
    def rms_norm(input: torch.Tensor, weight: torch.Tensor,
                 epsilon: float) -> torch.Tensor:
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@ -240,6 +240,7 @@ class AttentionLayer(Protocol):
    _q_scale: torch.Tensor
    _k_scale: torch.Tensor
    _v_scale: torch.Tensor
+    _q_scale_float: float
    _k_scale_float: float
    _v_scale_float: float
    _prob_scale: torch.Tensor
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@ -68,6 +68,7 @@ class RequestFuncInput:
    model: str
    model_name: Optional[str] = None
    logprobs: Optional[int] = None
+    extra_headers: Optional[dict] = None
    extra_body: Optional[dict] = None
    multi_modal_content: Optional[Union[dict, list[dict]]] = None
    ignore_eos: bool = False
@ -129,6 +130,8 @@ async def async_request_openai_completions(
    headers = {
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
    }
+    if request_func_input.extra_headers:
+        headers |= request_func_input.extra_headers
    if request_func_input.request_id:
        headers["x-request-id"] = request_func_input.request_id

@ -258,6 +261,8 @@ async def async_request_openai_chat_completions(
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
    }
+    if request_func_input.extra_headers:
+        headers |= request_func_input.extra_headers
    if request_func_input.request_id:
        headers["x-request-id"] = request_func_input.request_id

@ -364,6 +369,8 @@ async def async_request_openai_audio(
    headers = {
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
    }
+    if request_func_input.extra_headers:
+        headers |= request_func_input.extra_headers
    if request_func_input.request_id:
        headers["x-request-id"] = request_func_input.request_id

--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@ -389,6 +389,7 @@ async def benchmark(
    goodput_config_dict: dict[str, float],
    max_concurrency: Optional[int],
    lora_modules: Optional[Iterable[str]],
+    extra_headers: Optional[dict],
    extra_body: Optional[dict],
    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
    ramp_up_start_rps: Optional[int] = None,
@ -452,6 +453,7 @@ async def benchmark(
        logprobs=logprobs,
        multi_modal_content=test_mm_content,
        ignore_eos=ignore_eos,
+        extra_headers=extra_headers,
        extra_body=extra_body,
    )

@ -484,6 +486,7 @@ async def benchmark(
                                         logprobs=logprobs,
                                         multi_modal_content=test_mm_content,
                                         ignore_eos=ignore_eos,
+                                         extra_headers=extra_headers,
                                         extra_body=extra_body)
        profile_output = await request_func(
            request_func_input=profile_input, session=session)
@ -568,6 +571,7 @@ async def benchmark(
                                              logprobs=logprobs,
                                              multi_modal_content=mm_content,
                                              ignore_eos=ignore_eos,
+                                              extra_headers=extra_headers,
                                              extra_body=extra_body,
                                              request_id=request_id,)
        tasks.append(
@ -815,6 +819,15 @@ def add_cli_args(parser: argparse.ArgumentParser):
        default="/v1/completions",
        help="API endpoint.",
    )
+    parser.add_argument(
+        "--header",
+        metavar="KEY=VALUE",
+        nargs="*",
+        help="Key-value pairs (e.g, --header x-additional-info=0.3.3) "
+        "for headers to be passed with each request. These headers override " \
+        "per backend constants and values set via environment variable, and " \
+        "will be overriden by other arguments (such as request ids)."
+    )
    parser.add_argument(
        "--max-concurrency",
        type=int,
@ -1104,6 +1117,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"

+    # Headers
+    headers = None
+    if args.header:
+        headers = {}
+        for item in args.header:
+            if "=" in item:
+                kvstring = item.split("=", 1)
+                headers[kvstring[0].strip()] = kvstring[1].strip()
+            else:
+                raise ValueError(
+                    "Invalid header format. Please use KEY=VALUE format."
+                )
+
    tokenizer = get_tokenizer(tokenizer_id,
                              tokenizer_mode=tokenizer_mode,
                              trust_remote_code=args.trust_remote_code)
@ -1161,6 +1187,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
        goodput_config_dict=goodput_config_dict,
        max_concurrency=args.max_concurrency,
        lora_modules=args.lora_modules,
+        extra_headers=headers,
        extra_body=sampling_params,
        ramp_up_strategy=args.ramp_up_strategy,
        ramp_up_start_rps=args.ramp_up_start_rps,
@ -1184,7 +1211,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
    if args.metadata:
        for item in args.metadata:
            if "=" in item:
-                kvstring = item.split("=")
+                kvstring = item.split("=", 1)
                result_json[kvstring[0].strip()] = kvstring[1].strip()
            else:
                raise ValueError(
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@ -182,7 +182,7 @@ class NaiveBlockAllocator(BlockAllocator):
            # Increment refcount for each block.
            assert block.block_id is not None
            refcount = self._refcounter.incr(block.block_id)
-            assert refcount != 1, "can't fork free'd block"
+            assert refcount != 1, "can't fork freed block"

            forked_block = self._block_pool.init_block(
                prev_block=prev_block,
--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@ -58,7 +58,7 @@ class Evictor(ABC):

 class BlockMetaData:
    """Data structure for storing key data describe cached block, so that
-    evitor could use to make its decision which one to choose for eviction
+    evictor could use to make its decision which one to choose for eviction

    Here we use physical block id as the dict key, as there maybe several
    blocks with the same content hash, but their physical id is unique.
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@ -337,11 +337,11 @@ class EplbState:
        Args:
            model (MixtureOfExperts): The MoE model.
            is_dummy (bool): If `True`, this is a dummy step and the load
-              metrics recorded in this forward pass will not count. Defaults
-              to `False`.
+                metrics recorded in this forward pass will not count. Defaults
+                to `False`.
            is_profile (bool): If `True`, perform a dummy rearrangement
-              with maximum communication cost. This is used in `profile_run`
-              to reserve enough memory for the communication buffer.
+                with maximum communication cost. This is used in `profile_run`
+                to reserve enough memory for the communication buffer.
            log_stats (bool): If `True`, log the expert load metrics.

        # Stats
--- a/vllm/distributed/eplb/rebalance_algo.py
+++ b/vllm/distributed/eplb/rebalance_algo.py
@ -102,14 +102,14 @@ def rebalance_experts_hierarchical(
    num_groups: int,
    num_nodes: int,
    num_gpus: int,
-):
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Parameters:
        weight: [num_moe_layers, num_logical_experts]
        num_physical_experts: number of physical experts after replication
        num_groups: number of expert groups
-        num_nodes: number of server nodes, where the intra-node network
-        (e.g, NVLink) is faster
+        num_nodes: number of server nodes, where the intra-node network 
+            (e.g, NVLink) is faster
        num_gpus: number of GPUs, must be a multiple of `num_nodes`

    Returns:
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@ -149,7 +149,7 @@ class KVConnectorBase_V1(ABC):

    @abstractmethod
    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+                      **kwargs: Any) -> None:
        """
        Start loading the KV cache from the connector to vLLM's paged
        KV buffer. This is called from the forward context before the
@ -182,7 +182,8 @@ class KVConnectorBase_V1(ABC):

    @abstractmethod
    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+                      attn_metadata: "AttentionMetadata",
+                      **kwargs: Any) -> None:
        """
        Start saving a layer of KV cache from vLLM's paged buffer 
        to the connector. This is called from within attention layer to
--- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@ -30,7 +30,7 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
    # Worker-side methods
    # ==============================
    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+                      **kwargs: Any) -> None:
        """
        Start loading the KV cache from the connector to vLLM's paged
        KV buffer. This is called from the forward context before the
@ -61,7 +61,8 @@ class LMCacheConnectorV1(KVConnectorBase_V1):
        self._lmcache_engine.wait_for_layer_load(layer_name)

    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+                      attn_metadata: "AttentionMetadata",
+                      **kwargs: Any) -> None:
        """
        Start saving the a layer of KV cache from vLLM's paged buffer 
        to the connector. This is called from within attention layer to
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@ -708,8 +708,6 @@ class NixlConnectorWorker:
        caches_data = []
        # With hybrid allocator, layers can share a kv cache tensor
        seen_base_addresses = []
-        xfer_buffers = (self.host_xfer_buffers
-                        if self.use_host_buffer else kv_caches)

        # Note(tms): I modified this from the original region setup code.
        # K and V are now in different regions. Advantage is that we can
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@ -91,7 +91,7 @@ class P2pNcclConnector(KVConnectorBase_V1):
    # ==============================

    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+                      **kwargs: Any) -> None:
        """Start loading the KV cache from the connector buffer to vLLM's
        paged KV buffer.

@ -212,7 +212,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
        return

    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+                      attn_metadata: "AttentionMetadata",
+                      **kwargs: Any) -> None:
        """Start saving the KV cache of the layer from vLLM's paged buffer
        to the connector.

@ -278,7 +279,7 @@ class P2pNcclConnector(KVConnectorBase_V1):

    def get_finished(
            self, finished_req_ids: set[str],
-            **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]:
+            **kwargs: Any) -> tuple[Optional[set[str]], Optional[set[str]]]:
        """
        Notifies worker-side connector ids of requests that have
        finished generating tokens.
--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py
@ -218,8 +218,9 @@ class TensorMemoryPool:

        return addr

-    def load_tensor(self, addr: int, dtype: torch.dtype,
-                    shape: tuple[int, ...], device) -> torch.Tensor:
+    def load_tensor(self, addr: int, dtype: torch.dtype, shape: tuple[int,
+                                                                      ...],
+                    device: torch.device) -> torch.Tensor:
        """Loads a tensor from pinned host memory to the specified device.

        Args:
--- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py
@ -3,7 +3,7 @@
 import hashlib
 import os
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Any, Optional

 import safetensors
 import torch
@ -90,7 +90,7 @@ class SharedStorageConnector(KVConnectorBase_V1):
        logger.info("Shared storage path is %s", self._storage_path)

    def start_load_kv(self, forward_context: "ForwardContext",
-                      **kwargs) -> None:
+                      **kwargs: Any) -> None:
        """Start loading the KV cache from the connector buffer to vLLM's 
        paged KV buffer.

@ -191,7 +191,8 @@ class SharedStorageConnector(KVConnectorBase_V1):
        return

    def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor,
-                      attn_metadata: "AttentionMetadata", **kwargs) -> None:
+                      attn_metadata: "AttentionMetadata",
+                      **kwargs: Any) -> None:
        """Start saving the KV cache of the layer from vLLM's paged buffer 
        to the connector.

--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@ -251,8 +251,8 @@ class PyNcclPipe(KVPipeBase):
        """
        Receives a tensor and its metadata from the source rank. Blocking call.

-        Args:
-            tensor: The received tensor, or `None` if no tensor is received.
+        Returns:
+            The received tensor, or `None` if no tensor is received.
        """
        if self.transport_thread is None:
            self.transport_thread = ThreadPoolExecutor(max_workers=1)
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -1296,11 +1296,8 @@ class EngineArgs:
            # Async scheduling does not work with the uniprocess backend.
            if self.distributed_executor_backend is None:
                self.distributed_executor_backend = "mp"
-                logger.info("Using mp-based distributed executor backend "
-                            "for async scheduling.")
-            if self.distributed_executor_backend == "uni":
-                raise ValueError("Async scheduling is not supported with "
-                                 "uni-process backend.")
+                logger.info("Defaulting to mp-based distributed executor "
+                            "backend for async scheduling.")
            if self.pipeline_parallel_size > 1:
                raise ValueError("Async scheduling is not supported with "
                                 "pipeline-parallel-size > 1.")
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@ -379,7 +379,7 @@ class LoggingStatLogger(StatLoggerBase):
        if local_interval_elapsed(stats.now, self.last_local_log,
                                  self.local_interval):
            # Compute summary metrics for tracked stats (and log them
-            # to promethus if applicable).
+            # to prometheus if applicable).
            prompt_throughput = get_throughput(self.num_prompt_tokens,
                                               now=stats.now,
                                               last_log=self.last_local_log)
@ -432,7 +432,7 @@ class LoggingStatLogger(StatLoggerBase):


 class PrometheusStatLogger(StatLoggerBase):
-    """PrometheusStatLogger is used LLMEngine to log to Promethus."""
+    """PrometheusStatLogger is used LLMEngine to log to Prometheus."""
    _metrics_cls = Metrics
    _gauge_cls = prometheus_client.Gauge

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@ -73,15 +73,10 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False):

    type: Required[Literal["audio_url"]]
    """The type of the content part."""
-    uuid: Optional[str]
-    """
-    User-provided UUID of a media. User must guarantee that it is properly
-    generated and unique for different medias.
-    """


 class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False):
-    image_embeds: Required[Union[str, dict[str, str]]]
+    image_embeds: Optional[Union[str, dict[str, str]]]
    """
    The image embeddings. It can be either:
    - A single base64 string.
@ -108,11 +103,6 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):

    type: Required[Literal["video_url"]]
    """The type of the content part."""
-    uuid: Optional[str]
-    """
-    User-provided UUID of a media. User must guarantee that it is properly
-    generated and unique for different medias.
-    """


 class PILImage(BaseModel):
@ -133,7 +123,7 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
    }
    """

-    image_pil: Required[PILImage]
+    image_pil: Optional[PILImage]
    uuid: Optional[str]
    """
    User-provided UUID of a media. User must guarantee that it is properly
@ -151,7 +141,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
    }
    """

-    image_url: Required[str]
+    image_url: Optional[str]
    uuid: Optional[str]
    """
    User-provided UUID of a media. User must guarantee that it is properly
@ -168,7 +158,7 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False):
    }
    """

-    audio_url: Required[str]
+    audio_url: Optional[str]


 class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
@ -180,7 +170,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
    }
    """

-    video_url: Required[str]
+    video_url: Optional[str]
    uuid: Optional[str]
    """
    User-provided UUID of a media. User must guarantee that it is properly
@ -597,7 +587,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        self._model_config = model_config
        self._tokenizer = tokenizer

-        self._items_by_modality = defaultdict[str, list[_T]](list)
+        self._items_by_modality = defaultdict[str, list[Optional[_T]]](list)
        self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list)

    @property
@ -624,14 +614,17 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        return self.mm_registry.create_processor(self.model_config)

    def add(
-        self, modality: ModalityStr, item: _T, uuid: Optional[str] = None
+        self,
+        modality: ModalityStr,
+        item: Optional[_T],
+        uuid: Optional[str] = None,
    ) -> Optional[str]:
        """
        Add a multi-modal item to the current prompt and returns the
        placeholder string to use, if any.

        An optional uuid can be added which serves as a unique identifier of the
-        media. 
+        media.
        """
        input_modality = modality.replace("_embeds", "")
        num_items = len(self._items_by_modality[modality]) + 1
@ -708,10 +701,15 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]):
        if not self._items_by_modality:
            return None
        mm_inputs = {}
-        items_by_modality = {
-            modality: await asyncio.gather(*items)
-            for modality, items in self._items_by_modality.items()
-        }
+        items_by_modality = {}
+        for modality, items in self._items_by_modality.items():
+            coros = []
+            for item in items:
+                if item is not None:
+                    coros.append(item)
+                else:
+                    coros.append(asyncio.sleep(0))
+            items_by_modality[modality] = await asyncio.gather(*coros)

        if "image" in items_by_modality and "image_embeds" in items_by_modality:
            raise ValueError(
@ -760,35 +758,40 @@ class BaseMultiModalContentParser(ABC):
        return dict(self._placeholder_storage)

    @abstractmethod
-    def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
+    def parse_image(
+        self, image_url: Optional[str], uuid: Optional[str] = None) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_image_embeds(
        self,
-        image_embeds: Union[str, dict[str, str]],
+        image_embeds: Union[str, dict[str, str], None],
        uuid: Optional[str] = None,
    ) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_image_pil(
-        self, image_pil: Image.Image, uuid: Optional[str] = None
+        self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
    ) -> None:
        raise NotImplementedError

    @abstractmethod
-    def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
+    def parse_audio(
+        self, audio_url: Optional[str], uuid: Optional[str] = None
+    ) -> None:
        raise NotImplementedError

    @abstractmethod
    def parse_input_audio(
-        self, input_audio: InputAudio, uuid: Optional[str] = None
+        self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
    ) -> None:
        raise NotImplementedError

    @abstractmethod
-    def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
+    def parse_video(
+        self, video_url: Optional[str], uuid: Optional[str] = None
+    ) -> None:
        raise NotImplementedError


@ -803,15 +806,17 @@ class MultiModalContentParser(BaseMultiModalContentParser):
            allowed_local_media_path=tracker.allowed_local_media_path,
        )

-    def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
-        image = self._connector.fetch_image(image_url)
+    def parse_image(
+        self, image_url: Optional[str], uuid: Optional[str] = None
+    ) -> None:
+        image = self._connector.fetch_image(image_url) if image_url else None

        placeholder = self._tracker.add("image", image, uuid)
        self._add_placeholder("image", placeholder)

    def parse_image_embeds(
        self,
-        image_embeds: Union[str, dict[str, str]],
+        image_embeds: Union[str, dict[str, str], None],
        uuid: Optional[str] = None,
    ) -> None:
        if isinstance(image_embeds, dict):
@ -825,31 +830,49 @@ class MultiModalContentParser(BaseMultiModalContentParser):
            embedding = self._connector.fetch_image_embedding(image_embeds)
            placeholder = self._tracker.add("image_embeds", embedding, uuid)

+        if image_embeds is None:
+            placeholder = self._tracker.add("image_embeds", None, uuid)
+
        self._add_placeholder("image", placeholder)

    def parse_image_pil(
-        self, image_pil: Image.Image, uuid: Optional[str] = None
+        self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
    ) -> None:
        placeholder = self._tracker.add("image", image_pil, uuid)
        self._add_placeholder("image", placeholder)

-    def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
-        audio = self._connector.fetch_audio(audio_url)
+    def parse_audio(
+        self, audio_url: Optional[str], uuid: Optional[str] = None
+    ) -> None:
+        audio = self._connector.fetch_audio(audio_url) if audio_url else None

        placeholder = self._tracker.add("audio", audio, uuid)
        self._add_placeholder("audio", placeholder)

    def parse_input_audio(
-        self, input_audio: InputAudio, uuid: Optional[str] = None
+        self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
    ) -> None:
-        audio_data = input_audio.get("data", "")
-        audio_format = input_audio.get("format", "")
-        audio_url = f"data:audio/{audio_format};base64,{audio_data}"
+        if input_audio:
+            audio_data = input_audio.get("data", "")
+            audio_format = input_audio.get("format", "")
+            if audio_data:
+                audio_url = f"data:audio/{audio_format};base64,{audio_data}"
+            else:
+                # If a UUID is provided, audio data may be empty.
+                audio_url = None
+        else:
+            audio_url = None

        return self.parse_audio(audio_url, uuid)

-    def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
-        video = self._connector.fetch_video(video_url=video_url)
+    def parse_video(
+        self, video_url: Optional[str], uuid: Optional[str] = None
+    ) -> None:
+        video = (
+            self._connector.fetch_video(video_url=video_url)
+            if video_url
+            else None
+        )

        placeholder = self._tracker.add("video", video, uuid)
        self._add_placeholder("video", placeholder)
@ -865,18 +888,24 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
            allowed_local_media_path=tracker.allowed_local_media_path,
        )

-    def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None:
-        image_coro = self._connector.fetch_image_async(image_url)
+    def parse_image(
+        self, image_url: Optional[str], uuid: Optional[str] = None
+    ) -> None:
+        image_coro = (
+            self._connector.fetch_image_async(image_url) if image_url else None
+        )

        placeholder = self._tracker.add("image", image_coro, uuid)
        self._add_placeholder("image", placeholder)

    def parse_image_embeds(
        self,
-        image_embeds: Union[str, dict[str, str]],
+        image_embeds: Union[str, dict[str, str], None],
        uuid: Optional[str] = None,
    ) -> None:
-        future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future()
+        future: asyncio.Future[Union[str, dict[str, str], None]] = (
+            asyncio.Future()
+        )

        if isinstance(image_embeds, dict):
            embeds = {
@ -889,35 +918,58 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
            embedding = self._connector.fetch_image_embedding(image_embeds)
            future.set_result(embedding)

+        if image_embeds is None:
+            future.set_result(None)
+
        placeholder = self._tracker.add("image_embeds", future, uuid)
        self._add_placeholder("image", placeholder)

    def parse_image_pil(
-        self, image_pil: Image.Image, uuid: Optional[str] = None
+        self, image_pil: Optional[Image.Image], uuid: Optional[str] = None
    ) -> None:
-        future: asyncio.Future[Image.Image] = asyncio.Future()
-        future.set_result(image_pil)
+        future: asyncio.Future[Optional[Image.Image]] = asyncio.Future()
+        if image_pil:
+            future.set_result(image_pil)
+        else:
+            future.set_result(None)

        placeholder = self._tracker.add("image", future, uuid)
        self._add_placeholder("image", placeholder)

-    def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None:
-        audio_coro = self._connector.fetch_audio_async(audio_url)
+    def parse_audio(
+        self, audio_url: Optional[str], uuid: Optional[str] = None
+    ) -> None:
+        audio_coro = (
+            self._connector.fetch_audio_async(audio_url) if audio_url else None
+        )

        placeholder = self._tracker.add("audio", audio_coro, uuid)
        self._add_placeholder("audio", placeholder)

    def parse_input_audio(
-        self, input_audio: InputAudio, uuid: Optional[str] = None
+        self, input_audio: Optional[InputAudio], uuid: Optional[str] = None
    ) -> None:
-        audio_data = input_audio.get("data", "")
-        audio_format = input_audio.get("format", "")
-        audio_url = f"data:audio/{audio_format};base64,{audio_data}"
+        if input_audio:
+            audio_data = input_audio.get("data", "")
+            audio_format = input_audio.get("format", "")
+            if audio_data:
+                audio_url = f"data:audio/{audio_format};base64,{audio_data}"
+            else:
+                # If a UUID is provided, audio data may be empty.
+                audio_url = None
+        else:
+            audio_url = None

        return self.parse_audio(audio_url, uuid)

-    def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None:
-        video = self._connector.fetch_video_async(video_url=video_url)
+    def parse_video(
+        self, video_url: Optional[str], uuid: Optional[str] = None
+    ) -> None:
+        video = (
+            self._connector.fetch_video_async(video_url=video_url)
+            if video_url
+            else None
+        )

        placeholder = self._tracker.add("video", video, uuid)
        self._add_placeholder("video", placeholder)
@ -1130,8 +1182,9 @@ def _parse_chat_message_content_mm_part(
        part, dict
    )  # This is needed to avoid mypy errors: part.get() from str
    part_type = part.get("type", None)
+    uuid = part.get("uuid", None)

-    if isinstance(part_type, str) and part_type in MM_PARSER_MAP:
+    if isinstance(part_type, str) and part_type in MM_PARSER_MAP and uuid is None: # noqa: E501
        content = MM_PARSER_MAP[part_type](part)

        # Special case for 'image_url.detail'
@ -1146,25 +1199,54 @@ def _parse_chat_message_content_mm_part(

    # Handle missing 'type' but provided direct URL fields.
    # 'type' is required field by pydantic
-    if part_type is None:
-        if part.get("image_url") is not None:
+    if part_type is None or uuid is not None:
+        if "image_url" in part:
            image_params = cast(
                CustomChatCompletionContentSimpleImageParam, part
            )
-            return "image_url", image_params.get("image_url", "")
-        if part.get("audio_url") is not None:
+            image_url = image_params.get("image_url", None)
+            if isinstance(image_url, dict):
+                # Can potentially happen if user provides a uuid
+                # with url as a dict of {"url": url}
+                image_url = image_url.get("url", None)
+            return "image_url", image_url
+        if "image_pil" in part:
+            # "image_pil" could be None if UUID is provided.
+            image_params = cast( # type: ignore 
+                CustomChatCompletionContentPILImageParam, part
+            )
+            image_pil = image_params.get("image_pil", None)
+            return "image_pil", image_pil
+        if "image_embeds" in part:
+            # "image_embeds" could be None if UUID is provided.
+            image_params = cast( # type: ignore 
+                ChatCompletionContentPartImageEmbedsParam, part
+            )
+            image_embeds = image_params.get("image_embeds", None)
+            return "image_embeds", image_embeds
+        if "audio_url" in part:
            audio_params = cast(
                CustomChatCompletionContentSimpleAudioParam, part
            )
-            return "audio_url", audio_params.get("audio_url", "")
+            audio_url = audio_params.get("audio_url", None)
+            if isinstance(audio_url, dict):
+                # Can potentially happen if user provides a uuid
+                # with url as a dict of {"url": url}
+                audio_url = audio_url.get("url", None)
+            return "audio_url", audio_url
        if part.get("input_audio") is not None:
            input_audio_params = cast(dict[str, str], part)
            return "input_audio", input_audio_params
-        if part.get("video_url") is not None:
+        if "video_url" in part:
            video_params = cast(
                CustomChatCompletionContentSimpleVideoParam, part
            )
-            return "video_url", video_params.get("video_url", "")
+            video_url = video_params.get("video_url", None)
+            if isinstance(video_url, dict):
+                # Can potentially happen if user provides a uuid
+                # with url as a dict of {"url": url}
+                video_url = video_url.get("url", None)
+            return "video_url", video_url
        # Raise an error if no 'type' or direct URL is found.
        raise ValueError("Missing 'type' field in multimodal part.")

@ -1173,15 +1255,9 @@ def _parse_chat_message_content_mm_part(
    return part_type, "unknown part_type content"


-VALID_MESSAGE_CONTENT_MM_PART_TYPES = (
+PART_TYPES_TO_SKIP_NONE_CONTENT = (
    "text",
    "refusal",
-    "image_url",
-    "image_embeds",
-    "image_pil",
-    "audio_url",
-    "input_audio",
-    "video_url",
 )


@ -1242,7 +1318,7 @@ def _parse_chat_message_content_part(
    part_type, content = _parse_chat_message_content_mm_part(part)
    # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
    # content is None, log a warning and skip
-    if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
+    if part_type in PART_TYPES_TO_SKIP_NONE_CONTENT and content is None:
        logger.warning(
            "Skipping multimodal part '%s' (type: '%s') "
            "with empty / unparsable content.",
@ -1266,7 +1342,10 @@ def _parse_chat_message_content_part(

    modality = None
    if part_type == "image_pil":
-        image_content = cast(Image.Image, content)
+        if content is not None:
+            image_content = cast(Image.Image, content)
+        else:
+            image_content = None
        mm_parser.parse_image_pil(image_content, uuid)
        modality = "image"
    elif part_type in ("image_url", "input_image"):
@ -1274,7 +1353,10 @@ def _parse_chat_message_content_part(
        mm_parser.parse_image(str_content, uuid)
        modality = "image"
    elif part_type == "image_embeds":
-        content = cast(Union[str, dict[str, str]], content)
+        if content is not None:
+            content = cast(Union[str, dict[str, str]], content)
+        else:
+            content = None
        mm_parser.parse_image_embeds(content, uuid)
        modality = "image"
    elif part_type == "audio_url":
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@ -1491,6 +1491,11 @@ class LLM:

        for i, prompt in enumerate(it):

+            if isinstance(prompt, dict):
+                self._validate_mm_data_and_uuids(
+                    prompt.get("multi_modal_data"),
+                    prompt.get("multi_modal_uuids"))
+
            param = params[i] if isinstance(params, Sequence) else params

            tokenization_kwargs: dict[str, Any] = {}
@ -1507,6 +1512,41 @@ class LLM:
                priority=priority[i] if priority else 0,
            )

+    def _validate_mm_data_and_uuids(
+            self,
+            multi_modal_data: Optional[Any],  # MultiModalDataDict
+            multi_modal_uuids: Optional[Any],  # MultiModalUUIDDict
+    ):
+        """
+        Validate that if any multi-modal data is skipped (i.e. None),
+        then its corresponding UUID must be set. 
+        """
+        if multi_modal_data is None:
+            return
+
+        for modality, data in multi_modal_data.items():
+            if isinstance(data, list):
+                for i, d in enumerate(data):
+                    if d is None:
+                        if multi_modal_uuids is None or modality not in multi_modal_uuids or multi_modal_uuids[  # noqa: E501
+                                modality] is None:
+                            raise ValueError(
+                                f"Multi-modal data for {modality} is None "
+                                f"but UUID is not provided")
+                        else:
+                            if len(
+                                    multi_modal_uuids[modality]
+                            ) <= i or multi_modal_uuids[modality][i] is None:
+                                raise ValueError(
+                                    f"Multi-modal data for {modality} is None "
+                                    f"but UUID is not provided")
+            else:
+                if data is None and (multi_modal_uuids is None
+                                     or modality not in multi_modal_uuids
+                                     or multi_modal_uuids[modality] is None):
+                    raise ValueError(f"Multi-modal data for {modality} is None"
+                                     f" but UUID is not provided")
+
    def _add_request(
        self,
        prompt: PromptType,
--- a/vllm/envs.py
+++ b/vllm/envs.py
@ -476,6 +476,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # - "FLASHINFER": use flashinfer
    # - "FLASHMLA": use FlashMLA
    # - "FLASH_ATTN_MLA": use FlashAttention for MLA
+    # - "FLASHINFER_MLA": use FlashInfer for MLA
+    # - "CUTLASS_MLA": use CUTLASS for MLA
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),

--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import os
+from concurrent.futures import Future, ThreadPoolExecutor
+from functools import cached_property
 from multiprocessing import Lock
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

@ -17,6 +18,7 @@ from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        run_method)
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.executor.utils import get_and_update_mm_cache
+from vllm.v1.outputs import AsyncModelRunnerOutput
 from vllm.worker.worker_base import WorkerWrapperBase

 logger = init_logger(__name__)
@ -31,15 +33,7 @@ class UniProcExecutor(ExecutorBase):
        """
        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
                                               rpc_rank=0)
-        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
-        local_rank = 0
-        # set local rank as the device index if specified
-        device_info = self.vllm_config.device_config.device.__str__().split(
-            ":")
-        if len(device_info) > 1:
-            local_rank = int(device_info[1])
-        rank = 0
+        distributed_init_method, rank, local_rank = self._distributed_args()
        is_driver_worker = True
        kwargs = dict(
            vllm_config=self.vllm_config,
@ -50,21 +44,56 @@ class UniProcExecutor(ExecutorBase):
        )
        self.mm_receiver_cache = worker_receiver_cache_from_config(
            self.vllm_config, MULTIMODAL_REGISTRY, Lock())
+
+        self.async_output_thread: Optional[ThreadPoolExecutor] = None
+        if self.max_concurrent_batches > 1:
+            self.async_output_thread = ThreadPoolExecutor(
+                max_workers=1, thread_name_prefix="WorkerAsyncOutput")
+
        self.collective_rpc("init_worker", args=([kwargs], ))
        self.collective_rpc("init_device")
        self.collective_rpc("load_model")

+    def _distributed_args(self) -> tuple[str, int, int]:
+        """Return (distributed_init_method, rank, local_rank)."""
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        # set local rank as the device index if specified
+        device_info = self.vllm_config.device_config.device.__str__().split(
+            ":")
+        local_rank = int(device_info[1]) if len(device_info) > 1 else 0
+        return distributed_init_method, 0, local_rank
+
+    @cached_property
+    def max_concurrent_batches(self) -> int:
+        return 2 if self.scheduler_config.async_scheduling else 1
+
    def collective_rpc(self,
                       method: Union[str, Callable],
                       timeout: Optional[float] = None,
                       args: Tuple = (),
-                       kwargs: Optional[Dict] = None) -> List[Any]:
+                       kwargs: Optional[Dict] = None,
+                       non_block: bool = False) -> List[Any]:
        if kwargs is None:
            kwargs = {}
        if self.mm_receiver_cache is not None and method == "execute_model":
            get_and_update_mm_cache(self.mm_receiver_cache, args)
-        answer = run_method(self.driver_worker, method, args, kwargs)
-        return [answer]
+
+        if not non_block:
+            return [run_method(self.driver_worker, method, args, kwargs)]
+
+        try:
+            result = run_method(self.driver_worker, method, args, kwargs)
+            if isinstance(result, AsyncModelRunnerOutput):
+                if (async_thread := self.async_output_thread) is not None:
+                    return [async_thread.submit(result.get_output)]
+                result = result.get_output()
+            future = Future[Any]()
+            future.set_result(result)
+        except Exception as e:
+            future = Future[Any]()
+            future.set_exception(e)
+        return [future]

    def check_health(self) -> None:
        # UniProcExecutor will always be healthy as long as
@ -116,8 +145,9 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
            assert not envs.VLLM_ENABLE_V1_MULTIPROCESSING, \
            ("To get deterministic execution in V1, "
            "please set VLLM_ENABLE_V1_MULTIPROCESSING=0")
-        self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config,
-                                               rpc_rank=0)
+        super()._init_executor()
+
+    def _distributed_args(self) -> tuple[str, int, int]:
        # engines are launched in torchrun-compatible launchers
        # so we can use the env:// method.
        # required env vars:
@ -128,19 +158,7 @@ class ExecutorWithExternalLauncher(UniProcExecutor):
        distributed_init_method = "env://"
        rank = int(os.environ["RANK"])
        local_rank = int(os.environ["LOCAL_RANK"])
-        is_driver_worker = True
-        kwargs = dict(
-            vllm_config=self.vllm_config,
-            local_rank=local_rank,
-            rank=rank,
-            distributed_init_method=distributed_init_method,
-            is_driver_worker=is_driver_worker,
-        )
-        self.mm_receiver_cache = worker_receiver_cache_from_config(
-            self.vllm_config, MULTIMODAL_REGISTRY, Lock())
-        self.collective_rpc("init_worker", args=([kwargs], ))
-        self.collective_rpc("init_device")
-        self.collective_rpc("load_model")
+        return distributed_init_method, rank, local_rank

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """
--- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from math import log2
 from typing import Optional

 import torch
@ -10,6 +11,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
    TopKWeightAndReduceDelegate)
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
+from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.deep_gemm import (fp8_m_grouped_gemm_nt_masked,
                                  is_deep_gemm_e8m0_used)
@ -24,35 +26,28 @@ def _silu_mul_fp8_quant_deep_gemm(
    y_q_ptr,  # fp8 quantized activations (E, T, H)
    y_s_ptr,  # 16-bit scales (E, T, G)
    counts_ptr,  # int32 num tokens per expert (E)
-
    # Sizes ---------------------------------------------------------------
    H: tl.constexpr,  # hidden dimension (per output)
    GROUP_SIZE: tl.constexpr,  # elements per group (usually 128)
-
    # Strides for input (elements) ---------------------------------------
    stride_i_e,
    stride_i_t,
    stride_i_h,
-
    # Strides for y_q (elements) -----------------------------------------
    stride_yq_e,
    stride_yq_t,
    stride_yq_h,
-
    # Strides for y_s (elements) -----------------------------------------
    stride_ys_e,
    stride_ys_t,
    stride_ys_g,
-
    # Stride for counts (elements)
    stride_counts_e,
-
    # Numeric params ------------------------------------------------------
    eps: tl.constexpr,
    fp8_min: tl.constexpr,
    fp8_max: tl.constexpr,
    use_ue8m0: tl.constexpr,
-
    # Meta ---------------------------------------------------------------
    BLOCK: tl.constexpr,
    NUM_STAGES: tl.constexpr,
@ -101,17 +96,15 @@ def _silu_mul_fp8_quant_deep_gemm(
        tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)


-def silu_mul_fp8_quant_deep_gemm(
+def silu_mul_fp8_quant_deep_gemm_cuda(
    y: torch.Tensor,  # (E, T, 2*H)
    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
+    num_parallel_tokens=16,
    group_size: int = 128,
-    eps: float = 1e-10,
 ) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
-
-    y has shape (E, T, 2*H). The first half of the last dimension is 
+    y has shape (E, T, 2*H). The first half of the last dimension is
    silu-activated, multiplied by the second half, then quantized into FP8.
-
    Returns `(y_q, y_s)` where
    * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
    * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
@ -120,22 +113,17 @@ def silu_mul_fp8_quant_deep_gemm(
    E, T, H2 = y.shape
    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
    H = H2 // 2
-    G = H // group_size
-    assert H % group_size == 0, "H must be divisible by group_size"
-    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, \
-        "tokens_per_expert must be shape (E,)"
+    G = (H + group_size - 1) // group_size
+    assert H % 8 == 0, "H must be divisible by 8"
+    assert group_size == 128, "H must be divisible by 8"
+    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E
+
    tokens_per_expert = tokens_per_expert.to(device=y.device,
                                             dtype=torch.int32)

-    # allocate outputs
    fp8_dtype = torch.float8_e4m3fn
    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)

-    # strides (elements)
-    stride_i_e, stride_i_t, stride_i_h = y.stride()
-    stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
-
-    # desired scale strides (elements): (T*G, 1, T)
    stride_ys_e = T * G
    stride_ys_t = 1
    stride_ys_g = T
@ -144,47 +132,86 @@ def silu_mul_fp8_quant_deep_gemm(
                              dtype=torch.float32,
                              device=y.device)

-    stride_cnt_e = tokens_per_expert.stride()[0]
+    use_ue8m0 = is_deep_gemm_e8m0_used()

-    # Static grid over experts and H-groups.
-    # A loop inside the kernel handles the token dim
-    grid = (E * G, )
+    if E <= 16:
+        max_empirical_parallelism = 64
+    elif E <= 32:
+        max_empirical_parallelism = 16
+    else:
+        max_empirical_parallelism = 4

-    f_info = torch.finfo(fp8_dtype)
-    fp8_max = f_info.max
-    fp8_min = f_info.min
+    # We never want to launch more than Tx number of threads
+    # This computes the clip.
+    num_parallel_tokens = max(
+        1,
+        min(max_empirical_parallelism, 2**int(log2(min(num_parallel_tokens,
+                                                       T)))))
+    cuda_arch = current_platform.get_device_capability(
+        device_id=y.device.index).to_int()

-    _silu_mul_fp8_quant_deep_gemm[grid](
-        y,
-        y_q,
-        y_s,
-        tokens_per_expert,
-        H,
-        group_size,
-        stride_i_e,
-        stride_i_t,
-        stride_i_h,
-        stride_yq_e,
-        stride_yq_t,
-        stride_yq_h,
-        stride_ys_e,
-        stride_ys_t,
-        stride_ys_g,
-        stride_cnt_e,
-        eps,
-        fp8_min,
-        fp8_max,
-        is_deep_gemm_e8m0_used(),
-        BLOCK=group_size,
-        NUM_STAGES=4,
-        num_warps=1,
-    )
+    if cuda_arch >= 80:
+        torch.ops._C.silu_mul_fp8_quant_deep_gemm_cuda(y, tokens_per_expert,
+                                                       y_q, y_s, group_size,
+                                                       use_ue8m0,
+                                                       num_parallel_tokens)
+    else:
+        # Default to triton if not on cuda or if arch is too old
+        y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+        stride_cnt_e = tokens_per_expert.stride()[0]
+
+        # Static grid over experts and H-groups.
+        # A loop inside the kernel handles the token dim
+        grid = (E * G, )
+        # strides (elements)
+        stride_i_e, stride_i_t, stride_i_h = y.stride()
+        stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+        # desired scale strides (elements): (T*G, 1, T)
+        stride_ys_e = T * G
+        stride_ys_t = 1
+        stride_ys_g = T
+        y_s = torch.empty_strided(
+            (E, T, G),
+            (stride_ys_e, stride_ys_t, stride_ys_g),
+            dtype=torch.float32,
+            device=y.device,
+        )
+        f_info = torch.finfo(fp8_dtype)
+        fp8_max = f_info.max
+        fp8_min = f_info.min
+        eps: float = 1e-10
+        _silu_mul_fp8_quant_deep_gemm[grid](
+            y,
+            y_q,
+            y_s,
+            tokens_per_expert,
+            H,
+            group_size,
+            stride_i_e,
+            stride_i_t,
+            stride_i_h,
+            stride_yq_e,
+            stride_yq_t,
+            stride_yq_h,
+            stride_ys_e,
+            stride_ys_t,
+            stride_ys_g,
+            stride_cnt_e,
+            eps,
+            fp8_min,
+            fp8_max,
+            is_deep_gemm_e8m0_used(),
+            BLOCK=group_size,
+            NUM_STAGES=4,
+            num_warps=1,
+        )

    return y_q, y_s


 class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
-
    # The Deep Gemm kernels only support block size of 128
    DEEPGEMM_BLOCK_SHAPE: list[int] = [128, 128]

@ -297,8 +324,8 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
        fp8_m_grouped_gemm_nt_masked((a1q, a1q_scale), (w1, w1_scale),
                                     workspace1, expert_num_tokens, expected_m)

-        a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm(workspace1,
-                                                      expert_num_tokens)
+        a2q, a2q_scale = silu_mul_fp8_quant_deep_gemm_cuda(
+            workspace1, expert_num_tokens)

        fp8_m_grouped_gemm_nt_masked((a2q, a2q_scale), (w2, w2_scale), output,
                                     expert_num_tokens, expected_m)
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@ -740,7 +740,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
        """
        Handle special case for models where MLP layers are already
        fused on disk. In this case, we have no shard id. This function
-        determmines the shard id by splitting these layers and then calls
+        determines the shard id by splitting these layers and then calls
        the weight loader using the shard id.

        An example of a model with these fused layers:
@ -914,7 +914,7 @@ class QKVParallelLinear(ColumnParallelLinear):
        """
        Handle special case for models where QKV layers are already 
        fused on disk. In this case, we have no shard id. This function
-        determmines the shard id by splitting these layers and then calls
+        determines the shard id by splitting these layers and then calls
        the weight loader using the shard id.

        An example of a model with these fused layers:
--- a/vllm/model_executor/layers/quantization/kv_cache.py
+++ b/vllm/model_executor/layers/quantization/kv_cache.py
@ -88,6 +88,7 @@ class BaseKVCacheMethod(QuantizeMethodBase):
                    "Setting it to k_scale. This only matters for "
                    "the flash-attn backend.")
                layer._q_scale.copy_(k_scale)
+                layer._q_scale_float = k_scale

            # These are used in the final Attention.forward()
            layer._k_scale.copy_(k_scale)
@ -124,6 +125,7 @@ class BaseKVCacheMethod(QuantizeMethodBase):

        # These are used in the final Attention.forward()
        layer._q_scale.copy_(q_scale)
+        layer._q_scale_float = q_scale
        layer._prob_scale.copy_(prob_scale)
        if layer.kv_cache_dtype == "fp8" and (q_scale == 1.0
                                              or prob_scale == 1.0):
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@ -62,11 +62,8 @@ class RotaryEmbedding(CustomOp):
        positions: torch.Tensor,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """A PyTorch-native implementation of forward()."""
-        if offsets is not None:
-            positions = positions + offsets
        positions = positions.flatten()
        num_tokens = positions.shape[0]
        cos_sin = self.cos_sin_cache.index_select(0, positions)
@ -96,7 +93,6 @@ class RotaryEmbedding(CustomOp):
        positions: torch.Tensor,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        from vllm import _custom_ops as ops

@ -107,16 +103,10 @@ class RotaryEmbedding(CustomOp):
            self.cos_sin_cache = self.cos_sin_cache.to(query.device,
                                                       dtype=query.dtype)

-        # ops.rotary_embedding()/batched_rotary_embedding()
-        # are in-place operations that update the query and key tensors.
-        if offsets is not None:
-            ops.batched_rotary_embedding(positions, query, key, self.head_size,
-                                         self.cos_sin_cache,
-                                         self.is_neox_style, self.rotary_dim,
-                                         offsets)
-        else:
-            ops.rotary_embedding(positions, query, key, self.head_size,
-                                 self.cos_sin_cache, self.is_neox_style)
+        # ops.rotary_embedding() is an in-place operation
+        # that updates the query and key tensors.
+        ops.rotary_embedding(positions, query, key, self.head_size,
+                             self.cos_sin_cache, self.is_neox_style)
        return query, key

    def forward_xpu(
@ -124,29 +114,21 @@ class RotaryEmbedding(CustomOp):
        positions: torch.Tensor,
        query: torch.Tensor,
        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        from vllm._ipex_ops import ipex_ops as ops

        self.cos_sin_cache = self.cos_sin_cache.to(positions.device,
                                                   dtype=query.dtype)
-        # ops.rotary_embedding()/batched_rotary_embedding()
-        # are in-place operations that update the query and key tensors.
+        # ops.rotary_embedding() is an in-place operation
+        # that updates the query and key tensors.
        if key is None:
            # XPU kernel doesn't support key=None so fall back to native impl
            # TODO(sarckk): add support for optional key in
            # ipex.llm.functional.rotary_embedding_batched
-            return self.forward_native(positions, query, key, offsets)
+            return self.forward_native(positions, query, key)
        else:
-            if offsets is not None:
-                ops.batched_rotary_embedding(positions, query, key,
-                                             self.head_size,
-                                             self.cos_sin_cache,
-                                             self.is_neox_style,
-                                             self.rotary_dim, offsets)
-            else:
-                ops.rotary_embedding(positions, query, key, self.head_size,
-                                     self.cos_sin_cache, self.is_neox_style)
+            ops.rotary_embedding(positions, query, key, self.head_size,
+                                 self.cos_sin_cache, self.is_neox_style)
        return query, key

    def extra_repr(self) -> str:
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@ -258,7 +258,7 @@ class VocabParallelEmbedding(CustomOp):

        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
-        # Divide the weight matrix along the vocaburaly dimension.
+        # Divide the weight matrix along the vocabulary dimension.
        self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
        self.num_embeddings_per_partition = divide(self.num_embeddings_padded,
                                                   self.tp_size)
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@ -1446,7 +1446,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
            return None

        # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
        multimodal_embeddings: tuple[torch.Tensor, ...] = ()

        # NOTE: It is important to iterate over the keys in this dictionary
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@ -586,10 +586,10 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,

        # ruff: noqa
        # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the
-        # text to account for this. However, the audio preprocessing and encoder do not gurarantee they will
+        # text to account for this. However, the audio preprocessing and encoder do not guarantee they will
        # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens
        # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad
-        # the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab.
+        # the audio feature out to 188 soft tokens with the embedding of the last token in the embed_audio vocab.
        # TODO precompute and cache padding
        audio_padding_toks = torch.tensor([[self.vocab_size - 1]],
                                          dtype=torch.long,
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@ -823,7 +823,7 @@ class SupportsEagle3(Protocol):
        
        Args:
            layers: Tuple of layer indices that should output auxiliary
-              hidden states.
+                hidden states.
        """
        ...

--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@ -1520,15 +1520,9 @@ class BaseKeyeModule(nn.Module):
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
-                otherwise it will be `(seq_len,).
-            pixel_values: Pixel values to be fed to a model.
-                `None` if no images are passed.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
-            pixel_values_videos: Pixel values of videos to be fed to a model.
-                `None` if no videos are passed.
-            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
-                `None` if no videos are passed.
+                otherwise it will be `(seq_len,)`.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.
        """
        if intermediate_tensors is not None:
            inputs_embeds = None
--- a/vllm/model_executor/models/keye_vl1_5.py
+++ b/vllm/model_executor/models/keye_vl1_5.py
@ -58,17 +58,18 @@ def split_thw(grid_thw: torch.Tensor) -> torch.Tensor:
    return torch.cat([ones, h_w], dim=1).repeat_interleave(t, dim=0)


-def get_num_patches(grid_thw: torch.Tensor, num_frames: Union[list[int],
-                                                              torch.Tensor]):
+def get_num_patches(grid_thw: torch.Tensor,
+                    num_frames: Union[list[int], torch.Tensor]) -> list[int]:
    """
    Return num_patches per video.

    Args:
-        t: tensor with shape [N, ...] where each item is a list/tensor
-        cu_seqlens: list indicating the boundaries of groups
+        grid_thw: Tensor with shape [N, 3] containing temporal, height, width
+            dimensions
+        num_frames: List or tensor indicating the number of frames per video

    Returns:
-        list of ints representing the sum of products for each group
+        List of ints representing the number of patches for each video

    Examples:
        >>> # Suppose there are 2 videos with a total of 3 grids
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@ -732,7 +732,9 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            pixel_values: The pixels in each input image.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [LlavaImageInputs][]
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@ -535,8 +535,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            pixel_values: The pixels in each grid patch for each input image.
-            image_sizes: The original `(height, width)` for each input image.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [LlavaNextImageInputs][]
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@ -578,7 +578,9 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            pixel_values: The pixels in each input image.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.

        Info:
            [Mistral3ImagePixelInputs][]
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@ -387,11 +387,10 @@ class Llama4VisionEncoder(nn.Module):
    ) -> torch.Tensor:
        r"""
        Args:
-            inputs_embeds (`torch.FloatTensor` of shape
-                    `(batch_size, sequence_length, hidden_size)`):
-                Optionally, instead of passing `input_ids` you can choose to
-                directly pass an embedded representation. This is useful if you
-                want more control over how to convert `input_ids` indices into
+            hidden_states: Input tensor of shape 
+                (batch_size, sequence_length, hidden_size).
+                Hidden states from the model embeddings, representing 
+                the input tokens.
                associated vectors than the model's internal embedding
                lookup matrix.
        """
--- a/vllm/model_executor/models/moonvit.py
+++ b/vllm/model_executor/models/moonvit.py
@ -70,11 +70,15 @@ def multihead_attention(
    v: torch.Tensor,
    q_cu_seqlens: Optional[torch.Tensor] = None,
    k_cu_seqlens: Optional[torch.Tensor] = None,
-):
+) -> torch.Tensor:
    """Multi-head attention using flash attention 2.

    Args:
-        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
+        q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
            or (tot_seqlens, num_heads, head_dim) if packing.
        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
            The first element should be 0 and the last element should be q.shape[0].
@ -123,8 +127,14 @@ def sdpa_attention(
    """SDPA attention.

    Args:
-        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
+        q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim),
            or (tot_seqlens, num_heads, head_dim) if packing.
+        k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+        q_cu_seqlens: Optional cumulative sequence lengths of q.
+        k_cu_seqlens: Optional cumulative sequence lengths of k.
    """
    seq_length = q.shape[0]
    attention_mask = torch.zeros([1, seq_length, seq_length],
@ -387,7 +397,7 @@ class MLP2(nn.Module):
    def __init__(self,
                 dims: list[int],
                 activation,
-                 bias=True,
+                 bias: bool = True,
                 prefix: str = "",
                 use_data_parallel: bool = False):
        super().__init__()
--- a/vllm/model_executor/models/nemotron_vl.py
+++ b/vllm/model_executor/models/nemotron_vl.py
@ -560,7 +560,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
            return []

        # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image).
+        # tensor corresponding to a multimodal data item (image).
        multimodal_embeddings: tuple[torch.Tensor, ...] = ()

        # NOTE: It is important to iterate over the keys in this dictionary
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@ -52,10 +52,11 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
 from vllm.model_executor.models.utils import (
-    AutoWeightsLoader, is_pp_missing_parameter,
+    AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter,
    make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.configs import Olmo3Config


 class Olmo2Attention(nn.Module):
@ -68,7 +69,7 @@ class Olmo2Attention(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.config = vllm_config.model_config.hf_config
-        assert isinstance(self.config, Olmo2Config)
+        assert isinstance(self.config, (Olmo2Config, Olmo3Config))

        hidden_size = self.config.hidden_size
        self.tp_size = get_tensor_model_parallel_world_size()
@ -111,14 +112,14 @@ class Olmo2Attention(nn.Module):
        self.q_norm = RMSNorm(self.config.hidden_size,
                              eps=self.config.rms_norm_eps)

-        # Rotary embeddings.
-        self.rotary_emb = get_rope(
-            self.head_dim,
-            rotary_dim=self.head_dim,
-            max_position=self.max_position_embeddings,
-            base=self.rope_theta,  # type: ignore
-        )
        self.scaling = self.head_dim**-0.5
+
+        layer_idx = extract_layer_index(prefix)
+        sliding_window = None
+        if ((layer_types := getattr(self.config, "layer_types", None))
+                is not None and layer_types[layer_idx] == "sliding_attention"):
+            sliding_window = self.config.sliding_window
+
        self.attn = Attention(
            self.num_heads,
            self.head_dim,
@ -126,7 +127,20 @@ class Olmo2Attention(nn.Module):
            num_kv_heads=self.num_kv_heads,
            cache_config=vllm_config.cache_config,
            quant_config=vllm_config.quant_config,
-            prefix=prefix,
+            per_layer_sliding_window=sliding_window,
+            prefix=f"{prefix}.attn",
+        )
+
+        # Rotary embeddings. Rope scaling is only applied on full attention
+        # layers.
+        self.rope_scaling = (self.config.rope_scaling
+                             if sliding_window is None else None)
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,  # type: ignore
+            rope_scaling=self.rope_scaling,
        )

        # Attention output projection.
@ -176,7 +190,7 @@ class Olmo2MLP(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
-        assert isinstance(config, Olmo2Config)
+        assert isinstance(config, (Olmo2Config, Olmo3Config))
        hidden_size = config.hidden_size
        intermediate_size = config.intermediate_size

@ -221,7 +235,7 @@ class Olmo2DecoderLayer(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
-        assert isinstance(config, Olmo2Config)
+        assert isinstance(config, (Olmo2Config, Olmo3Config))
        # Attention block.
        self.self_attn = Olmo2Attention(vllm_config=vllm_config,
                                        prefix=f"{prefix}.self_attn")
@ -261,7 +275,7 @@ class Olmo2Model(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.config = vllm_config.model_config.hf_config
-        assert isinstance(self.config, Olmo2Config)
+        assert isinstance(self.config, (Olmo2Config, Olmo3Config))

        self.embed_tokens = VocabParallelEmbedding(
            self.config.vocab_size,
@ -376,7 +390,7 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
-        assert isinstance(config, Olmo2Config)
+        assert isinstance(config, (Olmo2Config, Olmo3Config))
        self.config = config
        self.model = Olmo2Model(vllm_config=vllm_config,
                                prefix=maybe_prefix(prefix, "model"))
--- a/vllm/model_executor/models/phi4_multimodal.py
+++ b/vllm/model_executor/models/phi4_multimodal.py
@ -374,8 +374,8 @@ class Phi4MMAudioMeanVarianceNormLayer(nn.Module):
    Typically used as a very first layer in a model.

    Args:
-        input_size: int
-            layer input size.
+        config: [Phi4MultimodalAudioConfig](https://huggingface.co/docs/transformers/model_doc/phi4_multimodal#transformers.Phi4MultimodalAudioConfig) 
+            object containing model parameters.
    """

    def __init__(self, config: Phi4MultimodalAudioConfig):
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@ -1154,7 +1154,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
            return None

        # The result multimodal_embeddings is tuple of tensors, with each
-        # tensor correspoending to a multimodal data item (image or video).
+        # tensor corresponding to a multimodal data item (image or video).
        multimodal_embeddings: tuple[torch.Tensor, ...] = ()

        # NOTE: It is important to iterate over the keys in this dictionary
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@ -1372,15 +1372,9 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
                batch.
                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
                opensource models), the shape will be `(3, seq_len)`,
-                otherwise it will be `(seq_len,).
-            pixel_values: Pixel values to be fed to a model.
-                `None` if no images are passed.
-            image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM.
-                `None` if no images are passed.
-            pixel_values_videos: Pixel values of videos to be fed to a model.
-                `None` if no videos are passed.
-            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
-                `None` if no videos are passed.
+                otherwise it will be `(seq_len,)`.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.
        """

        if intermediate_tensors is not None:
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@ -170,8 +170,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
        return quant_config

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        # NOTE: hidden_states can have either 1D or 2D shape.
-        orig_shape = hidden_states.shape
+        assert hidden_states.dim(
+        ) <= 2, "Qwen3MoeSparseMoeBlock only supports 1D or 2D inputs"
+        is_input_1d = hidden_states.dim() == 1
        hidden_dim = hidden_states.shape[-1]
        hidden_states = hidden_states.view(-1, hidden_dim)

@ -180,7 +181,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
        final_hidden_states = self.experts(hidden_states=hidden_states,
                                           router_logits=router_logits)

-        return final_hidden_states.view(orig_shape)
+        # return to 1d if input is 1d
+        return final_hidden_states.squeeze(0) if is_input_1d else \
+            final_hidden_states


 class Qwen3MoeAttention(nn.Module):
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@ -120,6 +120,7 @@ _TEXT_GENERATION_MODELS = {
    "NemotronHForCausalLM": ("nemotron_h", "NemotronHForCausalLM"),
    "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
    "Olmo2ForCausalLM": ("olmo2", "Olmo2ForCausalLM"),
+    "Olmo3ForCausalLM": ("olmo2", "Olmo2ForCausalLM"),
    "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"),
    "OPTForCausalLM": ("opt", "OPTForCausalLM"),
    "OrionForCausalLM": ("orion", "OrionForCausalLM"),
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@ -390,12 +390,9 @@ class Siglip2EncoderLayer(nn.Module):
                position_embeddings: torch.Tensor) -> tuple[torch.FloatTensor]:
        """
        Args:
-            hidden_states (`torch.FloatTensor`):
-                Input to the layer of shape `(batch, seq_len, embed_dim)`.
-            output_attentions (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the attentions tensors of all 
-                attention layers. See `attentions` under
-                returned tensors for more detail.
+            hidden_states: Input tensor of shape (batch, seq_len, embed_dim).
+            cu_seqlens: Cumulative sequence lengths tensor.
+            position_embeddings: Position embeddings tensor.
        """
        residual = hidden_states

@ -534,19 +531,11 @@ class Siglip2Encoder(nn.Module):
    ) -> torch.Tensor:
        r"""
        Args:
-            inputs_embeds (`torch.FloatTensor` of shape
-                `(batch_size, sequence_length, hidden_size)`):
-                Optionally, instead of passing `input_ids` you can choose to
-                directly pass an embedded representation. This is useful if
-                you want more control over how to convert `input_ids` indices
-                into associated vectors than the model's internal embedding
-                lookup matrix.
-            grid_thws (`torch.LongTensor`):
-                grid shape (num_patches, 3)
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See
-                `hidden_states` under returned tensors for more detail.
-            return_dict (`bool`, *optional*):
+            inputs_embeds: Input tensor of shape 
+                (batch_size, sequence_length, hidden_size).
+                Embedded representation of the input tokens.
+            grid_thws: Grid tensor of shape (num_patches, 3) 
+                containing grid dimensions.
                Whether or not to return a [`~utils.ModelOutput`] instead of
                a plain tuple.
        """
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@ -597,10 +597,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
        with the `input_ids`.

        Args:
-            audio_features: A batch of audio input chunks [B, N, 80, M].
-            audio_lens: Length of audio frames for each audio chunk [B].
-            audio_token_len: Length of audio tokens for each audio chunk [B'].
-                Note: batch dim is different from batch dim in audio chunks.
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Position indices for the input tokens.
+            intermediate_tensors: Intermediate tensors from prior forward pass.
+            inputs_embeds: Optional tensor of input embeddings.

        """

--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@ -585,12 +585,12 @@ class VoxtralForConditionalGeneration(nn.Module, SupportsMultiModal,
             r"language_model.model.layers.\1.mlp.down_proj"),
            (r"layers\.(\d+)\.feed_forward\.w3",
             r"language_model.model.layers.\1.mlp.up_proj"),
-            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo",
-             r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj"
-             ),
            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.w(.*)",
             r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.\2_proj"
             ),
+            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wo",
+             r"whisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj"
+             ),
            (r"mm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward.w(\d+)",
             r"whisper_encoder.whisper_encoder.layers.\1.layers.mlp.fc\2"),
            (r"mm_whisper_embeddings\.whisper_encoder\.conv_layers\.0",
--- a/vllm/model_executor/models/zamba2.py
+++ b/vllm/model_executor/models/zamba2.py
@ -909,8 +909,8 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid):
            prefix: Optional prefix for parameter names
        
        Raises:
-            AssertionError: If prefix caching is enabled
-            (not supported by Mamba)
+            AssertionError: If prefix caching is enabled 
+                (not supported by Mamba)
        """
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@ -85,9 +85,10 @@ which are treated as audio embeddings;
 these are directly passed to the model without HF processing.
 """

-ModalityData: TypeAlias = Union[_T, list[_T]]
+ModalityData: TypeAlias = Union[_T, list[Optional[_T]], None]
 """
-Either a single data item, or a list of data items.
+Either a single data item, or a list of data items. Can only be None if UUID
+is provided.

 The number of data items allowed per modality is restricted by
 `--limit-mm-per-prompt`.
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@ -36,7 +36,7 @@ class ModalityDataItems(ABC, Generic[_T, _I]):
    def __init__(self, data: _T, modality: str) -> None:
        super().__init__()

-        self.data = data
+        self.data: _T = data
        self.modality = modality

    def __repr__(self) -> str:
@ -177,7 +177,9 @@ class DictEmbeddingItems(ModalityDataItems[Mapping[str, torch.Tensor],

 class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]):

-    def __init__(self, data: Sequence[HfAudioItem]) -> None:
+    def __init__(self, data: Optional[Sequence[HfAudioItem]]) -> None:
+        if data is None:
+            data = [None]
        super().__init__(data, "audio")

    def get_audio_length(self, item_idx: int) -> int:
@ -198,7 +200,9 @@ class ImageSize(NamedTuple):

 class ImageProcessorItems(ProcessorBatchItems[HfImageItem]):

-    def __init__(self, data: Sequence[HfImageItem]) -> None:
+    def __init__(self, data: Optional[Sequence[HfImageItem]]) -> None:
+        if data is None:
+            data = [None]
        super().__init__(data, "image")

    def get_image_size(self, item_idx: int) -> ImageSize:
@ -223,10 +227,12 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]):

    def __init__(
        self,
-        data: Sequence[HfVideoItem],
+        data: Optional[Sequence[HfVideoItem]],
        metadata: Optional[Union[dict[str, Any],
                                 list[Optional[dict[str, Any]]]]] = None,
    ) -> None:
+        if data is None:
+            data = [None]
        super().__init__(data, "video")
        self.metadata = metadata

@ -385,6 +391,9 @@ class MultiModalDataParser:
        self,
        data: ModalityData[AudioItem],
    ) -> Optional[ModalityDataItems[Any, Any]]:
+        if data is None:
+            return AudioProcessorItems(None)
+
        # also check single audio item with sampling rate
        if self._is_empty(data) or (isinstance(data, tuple)
                                    and self._is_empty(data[0])):
@ -420,6 +429,9 @@ class MultiModalDataParser:
        self,
        data: ModalityData[ImageItem],
    ) -> Optional[ModalityDataItems[Any, Any]]:
+        if data is None:
+            return ImageProcessorItems(None)
+
        if self._is_empty(data):
            return None

@ -441,6 +453,9 @@ class MultiModalDataParser:
        self,
        data: ModalityData[VideoItem],
    ) -> Optional[ModalityDataItems[Any, Any]]:
+        if data is None:
+            return VideoProcessorItems(None)
+
        if self._is_empty(data):
            return None

--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@ -1075,7 +1075,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
        [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
        """
        mm_items = self.data_parser.parse_mm_data(mm_data)
-
        for modality, items in mm_items.items():
            self.validate_num_items(modality, len(items))

@ -1436,10 +1435,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
            ]
            for modality, items_is_cached in mm_is_cached.items()
        }
-        mm_missing_data = {
-            modality: [mm_data_items[modality][idx] for idx in idxs]
-            for modality, idxs in mm_missing_idxs.items()
-        }
+        mm_missing_data = {}
+        for modality, idxs in mm_missing_idxs.items():
+            missing_modality_data = []
+            for idx in idxs:
+                data = mm_data_items[modality][idx]
+                if data is None:
+                    raise ValueError(
+                        f"Cache miss for {modality} at index {idx} "
+                        f"but data is not provided.")
+                else:
+                    missing_modality_data.append(data)
+            mm_missing_data[modality] = missing_modality_data

        return self._to_mm_items(mm_missing_data)

--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@ -179,6 +179,7 @@ class CudaPlatformBase(Platform):
                cache_config.block_size = 128
                logger.info("Forcing kv cache block size to 128 for "
                            "CUTLASS_MLA backend.")
+
            if use_flashinfer_mla and cache_config.block_size not in [32, 64]:
                cache_config.block_size = 64
                logger.info(
@ -541,7 +542,9 @@ class CudaPlatformBase(Platform):
                    attention_backend = "FLASHMLA"

            # Only FlashMLA and CUTLASS_MLA support fp8
-            if attention_backend in ["FLASHMLA", "CUTLASS_MLA"]:
+            if attention_backend in [
+                    "FLASHMLA", "CUTLASS_MLA", "FLASHINFER_MLA"
+            ]:
                supported = True
            else:
                supported = (not fp8_attention)
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@ -75,6 +75,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
    eagle="EAGLEConfig",
    speculators="SpeculatorsConfig",
    nemotron="NemotronConfig",
+    olmo3="Olmo3Config",
    ovis="OvisConfig",
    ultravox="UltravoxConfig",
    step3_vl="Step3VLConfig",
@ -678,20 +679,21 @@ def get_hf_file_to_dict(file_name: str,


@cache
-def get_pooling_config(model: str, revision: Optional[str] = 'main'):
+def get_pooling_config(model: str,
+                       revision: Optional[str] = 'main') -> Optional[dict]:
    """
    This function gets the pooling and normalize
    config from the model - only applies to
    sentence-transformers models.

    Args:
-        model (str): The name of the Hugging Face model.
-        revision (str, optional): The specific version
-        of the model to use. Defaults to 'main'.
+        model: The name of the Hugging Face model.
+        revision: The specific version of the model to use. 
+            Defaults to 'main'.

    Returns:
-        dict: A dictionary containing the pooling
-        type and whether normalization is used.
+        A dictionary containing the pooling type and whether 
+            normalization is used, or None if no pooling configuration is found.
    """

    modules_file_name = "modules.json"
--- a/vllm/transformers_utils/configs/init.py
+++ b/vllm/transformers_utils/configs/init.py
@ -23,6 +23,7 @@ from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
 from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config
+from vllm.transformers_utils.configs.olmo3 import Olmo3Config
 from vllm.transformers_utils.configs.ovis import OvisConfig
 from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
 from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig
@ -45,6 +46,7 @@ __all__ = [
    "NemotronConfig",
    "NemotronHConfig",
    "Nemotron_Nano_VL_Config",
+    "Olmo3Config",
    "OvisConfig",
    "SpeculatorsConfig",
    "UltravoxConfig",
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@ -74,10 +74,10 @@ class JAISConfig(PretrainedConfig):
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values
            attentions (not used by all models).
-        scale_attn_by_inverse_layer_idx (`bool`, *optional*,
-            defaults to `False`):
-            Whether to additionally scale attention weights by
-            `1 / layer_idx + 1`.
+        scale_attn_by_inverse_layer_idx
+            (`bool`, *optional*, defaults to `False`): 
+            Whether to additionally scale attention weights 
+            by `1 / layer_idx + 1`.
        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
            Whether to scale keys (K) prior to computing attention
            (dot-product)
--- a/vllm/transformers_utils/configs/olmo3.py
+++ b/vllm/transformers_utils/configs/olmo3.py
@ -0,0 +1,80 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Olmo3Config(PretrainedConfig):
+
+    model_type = "olmo3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50304,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=None,
+        eos_token_id=50279,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        rms_norm_eps=1e-5,
+        sliding_window=4096,
+        layer_types=None,
+        **kwargs,
+    ):
+        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
+        # in vLLM.
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["Olmo2ForCausalLM"]
+        elif "Olmo3ForCausalLM" in kwargs["architectures"]:
+            kwargs["architectures"].remove("Olmo3ForCausalLM")
+            kwargs["architectures"].append("Olmo2ForCausalLM")
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        self.rms_norm_eps = rms_norm_eps
+
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@ -37,10 +37,6 @@ class UltravoxConfig(transformers.PretrainedConfig):
            The initialization value for the layer normalization.
        projector_act (`str`, *optional*, defaults to `"swiglu"`):
            The activation function used by the multimodal projector.
-        text_model_lora_config (`LoraConfigSimplified`, *optional*):
-            The LoRA configuration for finetuning the text model.
-        audio_model_lora_config (`LoraConfigSimplified`, *optional*):
-            The LoRA configuration for finetuning the audio model.
        projector_ln_mid (`bool`, *optional*, defaults to `False`):
            Whether to apply layer normalization at the middle of the
            projector or at the end. Versions v0.4.1 and below
--- a/vllm/transformers_utils/processors/deepseek_vl2.py
+++ b/vllm/transformers_utils/processors/deepseek_vl2.py
@ -25,6 +25,7 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 import math
+from typing import Any

 import torch
 import torchvision.transforms as T
@ -178,17 +179,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
        prompt: str,
        images: list[Image.Image],
        inference_mode: bool = True,
-        **kwargs,
+        **kwargs: Any,
    ):
        """

        Args:
            prompt (str): the formatted prompt;
-            conversations (list[dict]): conversations with a list of messages;
            images (list[ImageType]): the list of images;
            inference_mode (bool): if True, then remove the last eos token;
-            system_prompt (str): the system prompt;
-            **kwargs:
+            **kwargs: Additional keyword arguments.

        Returns:
            outputs (BaseProcessorOutput): the output of the processor,
@ -259,7 +258,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
        text: str,
        images: list[Image.Image],
        inference_mode: bool = True,
-        **kwargs,
+        **kwargs: Any,
    ):
        """

--- a/vllm/transformers_utils/runai_utils.py
+++ b/vllm/transformers_utils/runai_utils.py
@ -33,7 +33,6 @@ def list_safetensors(path: str = "") -> list[str]:

    Args:
        path: The object storage path to list from.
-        allow_pattern: A list of patterns of which files to pull.

    Returns:
        list[str]: List of full object storage paths allowed by the pattern
@ -54,8 +53,7 @@ class ObjectStorageModel:
        dir: The temporary created directory.

    Methods:
-        pull_files(): Pull model from object storage to the temporary
-        directory.
+        pull_files(): Pull model from object storage to the temporary directory.
    """

    def __init__(self) -> None:
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import fnmatch
-from typing import Optional
+from typing import Any, Optional

 from vllm.utils import PlaceholderModule

@ -26,7 +26,7 @@ def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
    ]


-def glob(s3=None,
+def glob(s3: Optional[Any] = None,
         path: str = "",
         allow_pattern: Optional[list[str]] = None) -> list[str]:
    """
@ -51,7 +51,7 @@ def glob(s3=None,


 def list_files(
-        s3,
+        s3: Any,
        path: str,
        allow_pattern: Optional[list[str]] = None,
        ignore_pattern: Optional[list[str]] = None
--- a/vllm/utils/init.py
+++ b/vllm/utils/init.py
@ -2082,6 +2082,7 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args,
        return await task(*args, **kwargs)


+@lru_cache
 def supports_kw(
    callable: Callable[..., object],
    kw_name: str,
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@ -209,7 +209,8 @@ class GDNAttentionMetadataBuilder(

        # prepare tensors for cudagraph
        if (self.use_full_cuda_graph and num_prefills == 0 and num_decodes == 0
-                and num_spec_decodes <= self.decode_cudagraph_max_bs):
+                and num_spec_decodes <= self.decode_cudagraph_max_bs
+                and m.num_actual_tokens <= self.decode_cudagraph_max_bs):
            num_total_tokens = self.vllm_config.pad_for_cudagraph(
                m.num_actual_tokens)
            batch_size = num_total_tokens // (self.num_spec + 1)
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@ -584,7 +584,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
            window_left=self._global_hyperparameters.window_left,
            logits_soft_cap=self._global_hyperparameters.logits_soft_cap,
            q_data_type=self.model_config.dtype,
-            kv_data_type=self.kv_cache_spec.dtype,
        )

        # Prepare context prefills
@ -605,7 +604,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
                    logits_soft_cap=self._global_hyperparameters.
                    logits_soft_cap,
                    q_data_type=self.model_config.dtype,
-                    kv_data_type=self.kv_cache_spec.dtype,
                )

        prefill.prefill_main = self._fi_prefill_main
--- a/vllm/v1/attention/backends/mla/flashinfer_mla.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@ -6,8 +6,7 @@ from typing import Optional, Union
 import torch
 from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla

-from vllm.attention.backends.abstract import (AttentionLayer, AttentionType,
-                                              is_quantized_kv_cache)
+from vllm.attention.backends.abstract import AttentionLayer, AttentionType
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
                                                   MLACommonImpl,
@ -69,11 +68,9 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
                                      "are not implemented for "
                                      "FlashInferMLAImpl")

-        if is_quantized_kv_cache(self.kv_cache_dtype):
-            raise NotImplementedError(
-                "FlashInferMLA V1 with FP8 KV cache not yet supported")
-
        self._workspace_buffer = g_fi_workspace
+        self.bmm1_scale: Optional[float] = None
+        self.bmm2_scale: Optional[float] = None

    def _forward_decode(
        self,
@ -92,6 +89,12 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
        # trtllm API requires extra dimension q_len_per_request for MTP
        q = q.unsqueeze(1)

+        if self.bmm1_scale is None:
+            self.bmm1_scale = (layer._q_scale_float * layer._k_scale_float *
+                               self.scale)
+        if self.bmm2_scale is None:
+            self.bmm2_scale = layer._v_scale_float
+
        o = trtllm_batch_decode_with_kv_cache_mla(
            query=q,
            kv_cache=kv_c_and_k_pe_cache.unsqueeze(1),
@ -102,7 +105,8 @@ class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):
            block_tables=attn_metadata.decode.block_table,
            seq_lens=attn_metadata.decode.seq_lens,
            max_seq_len=attn_metadata.max_seq_len,
-            bmm1_scale=self.scale,
+            bmm1_scale=self.bmm1_scale,
+            bmm2_scale=self.bmm2_scale,
        )

        # TODO: Return LSE pending support from Flashinfer API:
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@ -159,6 +159,9 @@ class EngineCore:
            self.request_block_hasher = get_request_block_hasher(
                block_size, caching_hash_fn)

+        self.step_fn = (self.step if self.batch_queue is None else
+                        self.step_with_batch_queue)
+
    def _initialize_kv_caches(
            self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
        start = time.time()
@ -331,7 +334,8 @@ class EngineCore:
        model_executed = False
        if self.scheduler.has_requests():
            scheduler_output = self.scheduler.schedule()
-            future = self.model_executor.execute_model(scheduler_output)
+            future = self.model_executor.execute_model(scheduler_output,
+                                                       non_block=True)
            batch_queue.appendleft(
                (future, scheduler_output))  # type: ignore[arg-type]

@ -534,9 +538,6 @@ class EngineCoreProc(EngineCore):
                assert addresses.coordinator_input is not None
                logger.info("Waiting for READY message from DP Coordinator...")

-        self.step_fn = (self.step if self.batch_queue is None else
-                        self.step_with_batch_queue)
-
        # Mark the startup heap as static so that it's ignored by GC.
        # Reduces pause times of oldest generation collections.
        gc.collect()
--- a/Show More
+++ b/Show More